[RFC RDMA support v1: 1/5] add OpenFabrics RDMA libraries and base RDMA code to build

Message ID 1359410505-4715-1-git-send-email-mrhines@linux.vnet.ibm.com
State New

Commit Message

mrhines@linux.vnet.ibm.com Jan. 28, 2013, 10:01 p.m. UTC
From: "Michael R. Hines" <mrhines@us.ibm.com>


Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
---
 Makefile.target     |    5 +-
 include/qemu/rdma.h |  249 ++++++++++
 qemu-rdma.c         | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1609 insertions(+), 2 deletions(-)
 create mode 100644 include/qemu/rdma.h
 create mode 100644 qemu-rdma.c

Comments

Andreas Färber Jan. 30, 2013, 12:50 a.m. UTC | #1
Hello,

On 28.01.2013 23:01, mrhines@linux.vnet.ibm.com wrote:
> From: "Michael R. Hines" <mrhines@us.ibm.com>
> 
> 
> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
> ---
>  Makefile.target     |    5 +-
>  include/qemu/rdma.h |  249 ++++++++++
>  qemu-rdma.c         | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1609 insertions(+), 2 deletions(-)
>  create mode 100644 include/qemu/rdma.h
>  create mode 100644 qemu-rdma.c

This series is missing a cover letter with explanations, starting with
the acronym and its purpose. Further, the commit messages are lacking
descriptions. Any reason that the code is GPLv2 rather than GPLv2+?

Regards,
Andreas
Anthony Liguori Jan. 30, 2013, 2:16 a.m. UTC | #2
Andreas Färber <afaerber@suse.de> writes:

> Hello,
>
> On 28.01.2013 23:01, mrhines@linux.vnet.ibm.com wrote:
>> From: "Michael R. Hines" <mrhines@us.ibm.com>
>> 
>> 
>> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
>> ---
>>  Makefile.target     |    5 +-
>>  include/qemu/rdma.h |  249 ++++++++++
>>  qemu-rdma.c         | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 1609 insertions(+), 2 deletions(-)
>>  create mode 100644 include/qemu/rdma.h
>>  create mode 100644 qemu-rdma.c
>
> This series is missing a cover letter with explanations, starting with
> the acronym and its purpose.

Indeed.  A bit of an introduction to the problem the series solves is
in order.

Can you share performance data with and without RDMA?

> Further, the commit messages are lacking
> descriptions. Any reason that the code is GPLv2 rather than GPLv2+?

I assume copy/paste.  Michael, unless you had an explicit reason to use
GPLv2 only, we now prefer GPLv2+.
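
For reference, the v2+ variant of the license header used elsewhere in
QEMU reads roughly like this (wording approximate):

    /*
     * This work is licensed under the terms of the GNU GPL, version 2
     * or later.  See the COPYING file in the top-level directory.
     */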

Regards,

Anthony Liguori
>
> Regards,
> Andreas
>
> -- 
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
> GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
Stefan Hajnoczi Jan. 30, 2013, 9:44 a.m. UTC | #3
On Mon, Jan 28, 2013 at 05:01:41PM -0500, mrhines@linux.vnet.ibm.com wrote:
> From: "Michael R. Hines" <mrhines@us.ibm.com>
> 
> 
> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
> ---
>  Makefile.target     |    5 +-
>  include/qemu/rdma.h |  249 ++++++++++
>  qemu-rdma.c         | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1609 insertions(+), 2 deletions(-)
>  create mode 100644 include/qemu/rdma.h
>  create mode 100644 qemu-rdma.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index 760da1e..d1d6b8c 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -112,12 +112,13 @@ obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o
>  obj-y += hw/
>  obj-$(CONFIG_KVM) += kvm-all.o
>  obj-$(CONFIG_NO_KVM) += kvm-stub.o
> -obj-y += memory.o savevm.o cputlb.o
> +# "tracefunc.o" will go away - I use GCC's -finstrument-functions support inside tracefunc.o
> +obj-y += memory.o savevm.o cputlb.o qemu-rdma.o #tracefunc.o

Please drop tracefunc.o.

>  obj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += memory_mapping.o
>  obj-$(CONFIG_HAVE_CORE_DUMP) += dump.o
>  obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o
>  obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o
> -LIBS+=-lz
> +LIBS+=-lz -lrdmacm

This needs to be a ./configure option.  Not all users will choose to
build with RDMA support and rdmacm may not be available on all host
platforms.
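
As a sketch of what I mean (variable names approximate and untested --
follow the style of the neighbouring checks in ./configure):

    # ./configure: add --enable-rdma/--disable-rdma; when enabled,
    # record the option and pull in librdmacm
    if test "$rdma" = "yes" ; then
        echo "CONFIG_RDMA=y" >> $config_host_mak
        libs_softmmu="$libs_softmmu -lrdmacm"
    fi

Makefile.target can then build and link the new code conditionally
instead of unconditionally:

    obj-$(CONFIG_RDMA) += qemu-rdma.o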

Stefan
Paolo Bonzini Feb. 18, 2013, 11:02 a.m. UTC | #4
On 28/01/2013 23:01, mrhines@linux.vnet.ibm.com wrote:
> From: "Michael R. Hines" <mrhines@us.ibm.com>
> 
> 
> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
> ---
>  Makefile.target     |    5 +-
>  include/qemu/rdma.h |  249 ++++++++++

Please make this include/migration/rdma.h
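
That is, move the header and update its users accordingly, roughly:

    -#include "qemu/rdma.h"
    +#include "migration/rdma.h"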

Paolo

>  qemu-rdma.c         | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1609 insertions(+), 2 deletions(-)
>  create mode 100644 include/qemu/rdma.h
>  create mode 100644 qemu-rdma.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index 760da1e..d1d6b8c 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -112,12 +112,13 @@ obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o
>  obj-y += hw/
>  obj-$(CONFIG_KVM) += kvm-all.o
>  obj-$(CONFIG_NO_KVM) += kvm-stub.o
> -obj-y += memory.o savevm.o cputlb.o
> +# "tracefunc.o" will go away - I use GCC's -finstrument-functions support inside tracefunc.o
> +obj-y += memory.o savevm.o cputlb.o qemu-rdma.o #tracefunc.o
>  obj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += memory_mapping.o
>  obj-$(CONFIG_HAVE_CORE_DUMP) += dump.o
>  obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o
>  obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o
> -LIBS+=-lz
> +LIBS+=-lz -lrdmacm
>  
>  # xen support
>  obj-$(CONFIG_XEN) += xen-all.o xen-mapcache.o
> diff --git a/include/qemu/rdma.h b/include/qemu/rdma.h
> new file mode 100644
> index 0000000..099622e
> --- /dev/null
> +++ b/include/qemu/rdma.h
> @@ -0,0 +1,249 @@
> +/*
> + * RDMA data structures and helper functions header (for migration)
> + *
> + * Copyright IBM, Corp. 2013
> + *
> + * Authors:
> + *  Michael R. Hines <mrhines@us.ibm.com>
> + *  Jiuxing Liu <jl@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef _QEMU_RDMA_H
> +#define _QEMU_RDMA_H
> +
> +#include <rdma/rdma_cma.h>
> +#include "monitor/monitor.h"
> +
> +extern int rdmaport;
> +extern char rdmahost[64];
> +
> +struct qemu_rdma_context {
> +    /* cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
> +       cm_id->verbs, cm_id->channel, and cm_id->qp. */
> +    struct rdma_cm_id *cm_id;
> +    struct rdma_cm_id *listen_id;
> +
> +    struct ibv_context *verbs;
> +    struct rdma_event_channel *channel;
> +    struct ibv_qp *qp;
> +
> +    struct ibv_comp_channel *comp_channel;
> +    struct ibv_pd *pd;
> +    struct ibv_cq *cq;
> +};
> +
> +static inline void qemu_rdma_init_context(struct qemu_rdma_context *rdma_ctx)
> +{
> +    rdma_ctx->cm_id = NULL;
> +    rdma_ctx->listen_id = NULL;
> +    rdma_ctx->verbs = NULL;
> +    rdma_ctx->channel = NULL;
> +    rdma_ctx->qp = NULL;
> +    rdma_ctx->comp_channel = NULL;
> +    rdma_ctx->pd = NULL;
> +    rdma_ctx->cq = NULL;
> +}
> +
> +void cpu_physical_memory_reset_dirty_all(void);
> +
> +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx,
> +        const char *host, int port);
> +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx);
> +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx);
> +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx,
> +        void *in_data, int *in_len, void *out_data, int out_len);
> +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx,
> +        void *in_data, int *in_len, void *out_data, int out_len);
> +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx);
> +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx);
> +
> +/* Instead of registering whole ram blocks, we can register them in smaller
> + * chunks. This may be beneficial if the ram blocks have holes in them */
> +#define QEMU_RDMA_CHUNK_REGISTRATION
> +
> +#define QEMU_RDMA_LAZY_REGISTRATION
> +
> +#define QEMU_RDMA_REG_CHUNK_SHIFT 20
> +#define QEMU_RDMA_REG_CHUNK_SIZE (1UL << (QEMU_RDMA_REG_CHUNK_SHIFT))
> +#define QEMU_RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \
> +            (((unsigned long)(host_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT) - \
> +            ((unsigned long)(start_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT))
> +#define QEMU_RDMA_REG_NUM_CHUNKS(rdma_ram_block) \
> +            (QEMU_RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\
> +                (rdma_ram_block)->local_host_addr +\
> +                (rdma_ram_block)->length) + 1)
> +#define QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\
> +            ((((unsigned long)((rdma_ram_block)->local_host_addr) >> \
> +                QEMU_RDMA_REG_CHUNK_SHIFT) + (i)) << \
> +                QEMU_RDMA_REG_CHUNK_SHIFT))
> +#define QEMU_RDMA_REG_CHUNK_END(rdma_ram_block, i) \
> +            (QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) + \
> +             QEMU_RDMA_REG_CHUNK_SIZE)
> +
> +struct qemu_rdma_ram_block {
> +    uint8_t *local_host_addr;
> +    uint64_t remote_host_addr;
> +    uint64_t offset;
> +    uint64_t length;
> +    struct ibv_mr **pmr;
> +    struct ibv_mr *mr;
> +    uint32_t remote_rkey;
> +};
> +
> +struct qemu_rdma_remote_ram_block {
> +    uint64_t remote_host_addr;
> +    uint64_t offset;
> +    uint64_t length;
> +    uint32_t remote_rkey;
> +};
> +
> +#define QEMU_MAX_RAM_BLOCKS 64
> +
> +struct qemu_rdma_ram_blocks {
> +    int num_blocks;
> +    struct qemu_rdma_ram_block block[QEMU_MAX_RAM_BLOCKS];
> +};
> +
> +struct qemu_rdma_remote_ram_blocks {
> +    int num_blocks;
> +    struct qemu_rdma_remote_ram_block block[QEMU_MAX_RAM_BLOCKS];
> +};
> +
> +int qemu_rdma_init_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks);
> +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx,
> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
> +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx,
> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
> +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
> +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
> +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks);
> +
> +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
> +        struct qemu_rdma_remote_ram_blocks *remote);
> +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
> +        struct qemu_rdma_remote_ram_blocks *remote);
> +
> +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length,
> +        struct qemu_rdma_ram_blocks *blocks,
> +        int *block_index, int *chunk_index);
> +
> +struct qemu_rdma_migration_data {
> +    char *host;
> +    int port;
> +    int enabled;
> +
> +    struct qemu_rdma_context rdma_ctx;
> +    struct qemu_rdma_ram_blocks rdma_ram_blocks;
> +
> +    /* This is used for synchronization: We use
> +       IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs
> +       are done. When the receiver gets it, it can be certain
> +       that all the RDMAs are completed. */
> +    int sync;
> +    struct ibv_mr *sync_mr;
> +
> +    /* This is used for the server to write the remote
> +       ram blocks info. */
> +    struct qemu_rdma_remote_ram_blocks remote_info;
> +    struct ibv_mr *remote_info_mr;
> +
> +    /* The rest is only for the initiator of the migration. */
> +    int client_init_done;
> +
> +    /* number of outstanding unsignaled send */
> +    int num_unsignaled_send;
> +
> +    /* number of outstanding signaled send */
> +    int num_signaled_send;
> +
> +    /* store info about current buffer so that we can
> +       merge it with future sends */
> +    uint64_t current_offset;
> +    uint64_t current_length;
> +    /* index of ram block the current buffer belongs to */
> +    int current_index;
> +    /* index of the chunk in the current ram block */
> +    int current_chunk;
> +
> +    uint64_t total_bytes;
> +
> +};
> +
> +extern struct qemu_rdma_migration_data rdma_mdata;
> +
> +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata);
> +
> +static inline int qemu_use_rdma_migration(void)
> +{
> +    /* port will be non-zero if user wants to use RDMA. */
> +    return rdma_mdata.port != -1;
> +}
> +
> +static inline int qemu_rdma_migration_enabled(void)
> +{
> +    return rdma_mdata.enabled;
> +}
> +
> +void qemu_rdma_migration_disable(void);
> +
> +#define QEMU_RDMA_MIGRATION_BLOCKING
> +#define QEMU_RDMA_MIGRATION_EXTRA_SYNC
> +
> +enum {
> +    QEMU_RDMA_MIGRATION_WRID_NONE = 0,
> +    QEMU_RDMA_MIGRATION_WRID_RDMA,
> +    QEMU_RDMA_MIGRATION_WRID_SEND_SYNC,
> +    QEMU_RDMA_MIGRATION_WRID_RECV_SYNC,
> +    QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO,
> +    QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO,
> +    QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC,
> +    QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC,
> +};
> +
> +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host,
> +                                                              int port);
> +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata,
> +        int wr_id);
> +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata,
> +        int wr_id);
> +
> +int qemu_rdma_migration_reg_remote_info(
> +        struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_dereg_remote_info(
> +        struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_post_send_remote_info(
> +        struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_post_recv_remote_info(
> +        struct qemu_rdma_migration_data *mdata);
> +
> +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata,
> +        uint64_t addr, uint64_t len);
> +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_wait_for_wrid(
> +        struct qemu_rdma_migration_data *mdata,
> +        int wrid);
> +int qemu_rdma_migration_poll_for_wrid(
> +        struct qemu_rdma_migration_data *mdata,
> +        int wrid);
> +int qemu_rdma_migration_block_for_wrid(
> +        struct qemu_rdma_migration_data *mdata,
> +        int wrid);
> +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata);
> +
> +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata);
> +int qemu_rdma_migration_server_wait_for_client(
> +                                       struct qemu_rdma_migration_data *mdata);
> +
> +#endif
> diff --git a/qemu-rdma.c b/qemu-rdma.c
> new file mode 100644
> index 0000000..5f16875
> --- /dev/null
> +++ b/qemu-rdma.c
> @@ -0,0 +1,1357 @@
> +/*
> + * RDMA data structures and helper functions (for migration)
> + *
> + * Copyright IBM, Corp. 2013
> + *
> + * Authors:
> + *  Michael R. Hines <mrhines@us.ibm.com>
> + *  Jiuxing Liu <jl@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "qemu/rdma.h"
> +#include "qemu-common.h"
> +#include <stdio.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <netdb.h>
> +#include <arpa/inet.h>
> +#include <string.h>
> +
> +#define QEMU_RDMA_RESOLVE_TIMEOUT_MS 10000
> +#define QEMU_RDMA_CQ_SIZE 2000
> +#define QEMU_RDMA_QP_SIZE 1000
> +
> +int rdmaport = -1;
> +char rdmahost[64] = "";
> +struct qemu_rdma_migration_data rdma_mdata;
> +
> +static void *qemu_rdma_mallocz(size_t size)
> +{
> +    void *ptr;
> +    ptr = malloc(size);
> +    /* guard against a failed allocation before zeroing */
> +    if (ptr) {
> +        memset(ptr, 0, size);
> +    }
> +    return ptr;
> +}
> +
> +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx,
> +        const char *host, int port)
> +{
> +    int ret;
> +    struct addrinfo *res;
> +    char port_str[16];
> +    struct rdma_cm_event *cm_event;
> +
> +
> +    if (!strcmp(host, "")) {
> +        printf("RDMA hostname has not been set\n");
> +        return -1;
> +    }
> +
> +    /* create CM channel */
> +    rdma_ctx->channel = rdma_create_event_channel();
> +    if (!rdma_ctx->channel) {
> +        printf("could not create CM channel\n");
> +        return -1;
> +    }
> +
> +    /* create CM id */
> +    ret = rdma_create_id(rdma_ctx->channel, &rdma_ctx->cm_id, NULL,
> +            RDMA_PS_TCP);
> +    if (ret) {
> +        printf("could not create channel id\n");
> +        goto err_resolve_create_id;
> +    }
> +
> +    snprintf(port_str, 16, "%d", port);
> +    port_str[15] = '\0';
> +    ret = getaddrinfo(host, port_str, NULL, &res);
> +    if (ret < 0) {
> +        printf("could not getaddrinfo address %s\n", host);
> +        goto err_resolve_get_addr;
> +    }
> +
> +    /* resolve the first address */
> +    ret = rdma_resolve_addr(rdma_ctx->cm_id, NULL, res->ai_addr,
> +            QEMU_RDMA_RESOLVE_TIMEOUT_MS);
> +    if (ret) {
> +        printf("could not resolve address %s\n", host);
> +        goto err_resolve_get_addr;
> +    }
> +
> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> +    if (ret) {
> +        printf("could not perform event_addr_resolved\n");
> +        goto err_resolve_get_addr;
> +    }
> +    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
> +        printf("result not equal to event_addr_resolved\n");
> +        rdma_ack_cm_event(cm_event);
> +        goto err_resolve_get_addr;
> +    }
> +    rdma_ack_cm_event(cm_event);
> +
> +    /* resolve route */
> +    ret = rdma_resolve_route(rdma_ctx->cm_id, QEMU_RDMA_RESOLVE_TIMEOUT_MS);
> +    if (ret) {
> +        printf("could not resolve rdma route\n");
> +        goto err_resolve_get_addr;
> +    }
> +
> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> +    if (ret) {
> +        printf("could not perform event_route_resolved\n");
> +        goto err_resolve_get_addr;
> +    }
> +    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
> +        printf("result not equal to event_route_resolved\n");
> +        rdma_ack_cm_event(cm_event);
> +        goto err_resolve_get_addr;
> +    }
> +    rdma_ack_cm_event(cm_event);
> +
> +    rdma_ctx->verbs = rdma_ctx->cm_id->verbs;
> +    return 0;
> +
> +err_resolve_get_addr:
> +    rdma_destroy_id(rdma_ctx->cm_id);
> +err_resolve_create_id:
> +    rdma_destroy_event_channel(rdma_ctx->channel);
> +    rdma_ctx->channel = NULL;
> +
> +    return -1;
> +}
> +
> +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx)
> +{
> +
> +    /* allocate pd */
> +    rdma_ctx->pd = ibv_alloc_pd(rdma_ctx->verbs);
> +    if (!rdma_ctx->pd) {
> +        return -1;
> +    }
> +
> +#ifdef QEMU_RDMA_MIGRATION_BLOCKING
> +    /* create completion channel */
> +    rdma_ctx->comp_channel = ibv_create_comp_channel(rdma_ctx->verbs);
> +    if (!rdma_ctx->comp_channel) {
> +        goto err_alloc_pd_cq;
> +    }
> +#endif
> +
> +    /* create cq */
> +    rdma_ctx->cq = ibv_create_cq(rdma_ctx->verbs, QEMU_RDMA_CQ_SIZE,
> +            NULL, rdma_ctx->comp_channel, 0);
> +    if (!rdma_ctx->cq) {
> +        goto err_alloc_pd_cq;
> +    }
> +
> +    return 0;
> +
> +err_alloc_pd_cq:
> +    if (rdma_ctx->pd) {
> +        ibv_dealloc_pd(rdma_ctx->pd);
> +    }
> +    if (rdma_ctx->comp_channel) {
> +        ibv_destroy_comp_channel(rdma_ctx->comp_channel);
> +    }
> +    rdma_ctx->pd = NULL;
> +    rdma_ctx->comp_channel = NULL;
> +    return -1;
> +
> +}
> +
> +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx)
> +{
> +    struct ibv_qp_init_attr attr = { 0 };
> +    int ret;
> +
> +    attr.cap.max_send_wr = QEMU_RDMA_QP_SIZE;
> +    attr.cap.max_recv_wr = 2;
> +    attr.cap.max_send_sge = 1;
> +    attr.cap.max_recv_sge = 1;
> +    attr.send_cq = rdma_ctx->cq;
> +    attr.recv_cq = rdma_ctx->cq;
> +    attr.qp_type = IBV_QPT_RC;
> +
> +    ret = rdma_create_qp(rdma_ctx->cm_id, rdma_ctx->pd, &attr);
> +    if (ret) {
> +        return -1;
> +    }
> +
> +    rdma_ctx->qp = rdma_ctx->cm_id->qp;
> +    return 0;
> +}
> +
> +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx,
> +        void *in_data, int *in_len, void *out_data, int out_len)
> +{
> +    int ret;
> +    struct rdma_conn_param conn_param = { 0 };
> +    struct rdma_cm_event *cm_event;
> +
> +    conn_param.initiator_depth = 2;
> +    conn_param.retry_count = 5;
> +    conn_param.private_data = out_data;
> +    conn_param.private_data_len = out_len;
> +
> +    ret = rdma_connect(rdma_ctx->cm_id, &conn_param);
> +    if (ret) {
> +        return -1;
> +    }
> +
> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> +    if (ret) {
> +        return -1;
> +    }
> +    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
> +        return -1;
> +    }
> +
> +    if (in_len) {
> +        if (*in_len > cm_event->param.conn.private_data_len) {
> +            *in_len = cm_event->param.conn.private_data_len;
> +        }
> +        if (*in_len) {
> +            memcpy(in_data, cm_event->param.conn.private_data, *in_len);
> +        }
> +    }
> +
> +    rdma_ack_cm_event(cm_event);
> +
> +    return 0;
> +}
> +
> +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host,
> +        int port)
> +{
> +    int ret;
> +    struct rdma_cm_event *cm_event;
> +    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
> +    struct ibv_context *verbs;
> +
> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> +    if (ret) {
> +        goto err_listen;
> +    }
> +
> +    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
> +        rdma_ack_cm_event(cm_event);
> +        goto err_listen;
> +    }
> +
> +    rdma_ctx->cm_id = cm_event->id;
> +    verbs = cm_event->id->verbs;
> +    printf("verbs context after listen: %p\n", verbs);
> +    rdma_ack_cm_event(cm_event);
> +
> +    if (!rdma_ctx->verbs) {
> +        rdma_ctx->verbs = verbs;
> +        ret = qemu_rdma_migration_server_prepare(mdata);
> +        if (ret) {
> +            fprintf(stderr, "rdma migration: error preparing server!\n");
> +            goto err_listen;
> +        }
> +    } else if (rdma_ctx->verbs != verbs) {
> +            fprintf(stderr, "ibv context not matching %p, %p!\n",
> +                    rdma_ctx->verbs, verbs);
> +            goto err_listen;
> +    }
> +    /* xxx destroy listen_id ??? */
> +
> +    return 0;
> +
> +err_listen:
> +
> +    return -1;
> +
> +}
> +
> +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx,
> +        void *in_data, int *in_len, void *out_data, int out_len)
> +{
> +    int ret;
> +    struct rdma_conn_param conn_param = { 0 };
> +    struct rdma_cm_event *cm_event;
> +
> +    conn_param.responder_resources = 2;
> +    conn_param.private_data = out_data;
> +    conn_param.private_data_len = out_len;
> +
> +    ret = rdma_accept(rdma_ctx->cm_id, &conn_param);
> +    if (ret) {
> +        fprintf(stderr, "rdma_accept returns %d!\n", ret);
> +        return -1;
> +    }
> +
> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> +    if (ret) {
> +        return -1;
> +    }
> +
> +    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
> +        rdma_ack_cm_event(cm_event);
> +        return -1;
> +    }
> +
> +    if (in_len) {
> +        if (*in_len > cm_event->param.conn.private_data_len) {
> +            *in_len = cm_event->param.conn.private_data_len;
> +        }
> +        if (*in_len) {
> +            memcpy(in_data, cm_event->param.conn.private_data, *in_len);
> +        }
> +    }
> +
> +    rdma_ack_cm_event(cm_event);
> +
> +    return 0;
> +}
> +
> +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx)
> +{
> +    int ret;
> +    struct rdma_cm_event *cm_event;
> +
> +    ret = rdma_disconnect(rdma_ctx->cm_id);
> +    if (ret) {
> +        return;
> +    }
> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> +    if (ret) {
> +        return;
> +    }
> +    rdma_ack_cm_event(cm_event);
> +}
> +
> +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx)
> +{
> +    if (rdma_ctx->qp) {
> +        ibv_destroy_qp(rdma_ctx->qp);
> +    }
> +    if (rdma_ctx->cq) {
> +        ibv_destroy_cq(rdma_ctx->cq);
> +    }
> +    if (rdma_ctx->comp_channel) {
> +        ibv_destroy_comp_channel(rdma_ctx->comp_channel);
> +    }
> +    if (rdma_ctx->pd) {
> +        ibv_dealloc_pd(rdma_ctx->pd);
> +    }
> +    if (rdma_ctx->listen_id) {
> +        rdma_destroy_id(rdma_ctx->listen_id);
> +    }
> +    if (rdma_ctx->cm_id) {
> +        rdma_destroy_id(rdma_ctx->cm_id);
> +    }
> +    if (rdma_ctx->channel) {
> +        rdma_destroy_event_channel(rdma_ctx->channel);
> +    }
> +}
> +
> +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx,
> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks)
> +{
> +    int i, j;
> +    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
> +        struct qemu_rdma_ram_block *block = &(rdma_ram_blocks->block[i]);
> +        int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block);
> +        /* allocate memory to store chunk MRs */
> +        rdma_ram_blocks->block[i].pmr = qemu_rdma_mallocz(
> +                                num_chunks * sizeof(struct ibv_mr *));
> +
> +        if (!block->pmr) {
> +            goto err_reg_chunk_ram_blocks;
> +        }
> +
> +        for (j = 0; j < num_chunks; j++) {
> +            uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, j);
> +            uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, j);
> +            if (start_addr < block->local_host_addr) {
> +                start_addr = block->local_host_addr;
> +            }
> +            if (end_addr > block->local_host_addr + block->length) {
> +                end_addr = block->local_host_addr + block->length;
> +            }
> +            block->pmr[j] = ibv_reg_mr(rdma_ctx->pd,
> +                                start_addr,
> +                                end_addr - start_addr,
> +                                IBV_ACCESS_LOCAL_WRITE |
> +                                IBV_ACCESS_REMOTE_WRITE |
> +                                IBV_ACCESS_REMOTE_READ);
> +            if (!block->pmr[j]) {
> +                break;
> +            }
> +        }
> +        if (j < num_chunks) {
> +            for (j--; j >= 0; j--) {
> +                ibv_dereg_mr(block->pmr[j]);
> +            }
> +            /* release the partially-filled MR array for this block */
> +            free(block->pmr);
> +            block->pmr = NULL;
> +            goto err_reg_chunk_ram_blocks;
> +        }
> +    }
> +
> +    return 0;
> +
> +err_reg_chunk_ram_blocks:
> +    for (i--; i >= 0; i--) {
> +        int num_chunks =
> +            QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
> +        for (j = 0; j < num_chunks; j++) {
> +            ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]);
> +        }
> +        free(rdma_ram_blocks->block[i].pmr);
> +        rdma_ram_blocks->block[i].pmr = NULL;
> +    }
> +
> +    return -1;
> +
> +}
> +
> +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx,
> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks)
> +{
> +    int i;
> +    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
> +        rdma_ram_blocks->block[i].mr =
> +            ibv_reg_mr(rdma_ctx->pd,
> +                    rdma_ram_blocks->block[i].local_host_addr,
> +                    rdma_ram_blocks->block[i].length,
> +                    IBV_ACCESS_LOCAL_WRITE |
> +                    IBV_ACCESS_REMOTE_WRITE |
> +                    IBV_ACCESS_REMOTE_READ);
> +        if (!rdma_ram_blocks->block[i].mr) {
> +                break;
> +        }
> +    }
> +
> +    if (i >= rdma_ram_blocks->num_blocks) {
> +        return 0;
> +    }
> +
> +    for (i--; i >= 0; i--) {
> +        ibv_dereg_mr(rdma_ram_blocks->block[i].mr);
> +    }
> +
> +    return -1;
> +
> +}
> +
> +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
> +                struct qemu_rdma_ram_blocks *rdma_ram_blocks)
> +{
> +#ifdef QEMU_RDMA_CHUNK_REGISTRATION
> +#ifdef QEMU_RDMA_LAZY_REGISTRATION
> +    return 0;
> +#else
> +    return qemu_rdma_reg_chunk_ram_blocks(rdma_ctx, rdma_ram_blocks);
> +#endif
> +#else
> +    return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks);
> +#endif
> +}
> +
> +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
> +                struct qemu_rdma_ram_blocks *rdma_ram_blocks)
> +{
> +    return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks);
> +}
> +
> +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks)
> +{
> +    int i, j;
> +    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
> +        int num_chunks;
> +        if (!rdma_ram_blocks->block[i].pmr) {
> +            continue;
> +        }
> +        num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
> +        for (j = 0; j < num_chunks; j++) {
> +            if (!rdma_ram_blocks->block[i].pmr[j]) {
> +                continue;
> +            }
> +            ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]);
> +        }
> +        free(rdma_ram_blocks->block[i].pmr);
> +        rdma_ram_blocks->block[i].pmr = NULL;
> +    }
> +    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
> +        if (!rdma_ram_blocks->block[i].mr) {
> +            continue;
> +        }
> +        ibv_dereg_mr(rdma_ram_blocks->block[i].mr);
> +        rdma_ram_blocks->block[i].mr = NULL;
> +    }
> +}
> +
> +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
> +                struct qemu_rdma_remote_ram_blocks *remote)
> +{
> +    int i;
> +    remote->num_blocks = local->num_blocks;
> +    for (i = 0; i < local->num_blocks; i++) {
> +            remote->block[i].remote_host_addr =
> +                (uint64_t)(local->block[i].local_host_addr);
> +            remote->block[i].remote_rkey = local->block[i].mr->rkey;
> +            remote->block[i].offset = local->block[i].offset;
> +            remote->block[i].length = local->block[i].length;
> +    }
> +}
> +
> +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
> +                struct qemu_rdma_remote_ram_blocks *remote)
> +{
> +    int i, j;
> +
> +    if (local->num_blocks != remote->num_blocks) {
> +        return -1;
> +    }
> +
> +    for (i = 0; i < remote->num_blocks; i++) {
> +        /* search local ram blocks */
> +        for (j = 0; j < local->num_blocks; j++) {
> +            if (remote->block[i].offset != local->block[j].offset) {
> +                continue;
> +            }
> +            if (remote->block[i].length != local->block[j].length) {
> +                return -1;
> +            }
> +            local->block[j].remote_host_addr =
> +                remote->block[i].remote_host_addr;
> +            local->block[j].remote_rkey = remote->block[i].remote_rkey;
> +            break;
> +        }
> +        if (j >= local->num_blocks) {
> +            return -1;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length,
> +                struct qemu_rdma_ram_blocks *blocks,
> +                int *block_index, int *chunk_index)
> +{
> +    int i;
> +    for (i = 0; i < blocks->num_blocks; i++) {
> +        if (offset < blocks->block[i].offset) {
> +            continue;
> +        }
> +        if (offset + length >
> +                blocks->block[i].offset + blocks->block[i].length) {
> +            continue;
> +        }
> +        *block_index = i;
> +        if (chunk_index) {
> +            uint8_t *host_addr = blocks->block[i].local_host_addr +
> +                (offset - blocks->block[i].offset);
> +            *chunk_index = QEMU_RDMA_REG_CHUNK_INDEX(
> +                    blocks->block[i].local_host_addr, host_addr);
> +        }
> +        return 0;
> +    }
> +    return -1;
> +}
> +
> +static int qemu_rdma_get_lkey(struct qemu_rdma_context *rdma_ctx,
> +        struct qemu_rdma_ram_block *block, uint64_t host_addr,
> +        uint32_t *lkey)
> +{
> +    int chunk;
> +    if (block->mr) {
> +        *lkey = block->mr->lkey;
> +        return 0;
> +    }
> +    if (!block->pmr) {
> +        int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block);
> +        /* allocate memory to store chunk MRs */
> +        block->pmr = qemu_rdma_mallocz(num_chunks *
> +                sizeof(struct ibv_mr *));
> +        if (!block->pmr) {
> +            return -1;
> +        }
> +    }
> +    chunk = QEMU_RDMA_REG_CHUNK_INDEX(block->local_host_addr, host_addr);
> +    if (!block->pmr[chunk]) {
> +        uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, chunk);
> +        uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, chunk);
> +        if (start_addr < block->local_host_addr) {
> +            start_addr = block->local_host_addr;
> +        }
> +        if (end_addr > block->local_host_addr + block->length) {
> +            end_addr = block->local_host_addr + block->length;
> +        }
> +        block->pmr[chunk] = ibv_reg_mr(rdma_ctx->pd,
> +                start_addr,
> +                end_addr - start_addr,
> +                IBV_ACCESS_LOCAL_WRITE |
> +                IBV_ACCESS_REMOTE_WRITE |
> +                IBV_ACCESS_REMOTE_READ);
> +        if (!block->pmr[chunk]) {
> +            return -1;
> +        }
> +    }
> +    *lkey = block->pmr[chunk]->lkey;
> +    return 0;
> +}
> +
> +static int qemu_rdma_write(struct qemu_rdma_context *rdma_ctx,
> +        struct qemu_rdma_ram_block *block,
> +        uint64_t offset, uint64_t length,
> +        uint64_t wr_id, enum ibv_send_flags flag)
> +{
> +    struct ibv_sge sge;
> +    struct ibv_send_wr send_wr = { 0 };
> +    struct ibv_send_wr *bad_wr;
> +
> +    sge.addr = (uint64_t)(block->local_host_addr + (offset - block->offset));
> +    sge.length = length;
> +    if (qemu_rdma_get_lkey(rdma_ctx, block, sge.addr, &sge.lkey)) {
> +        fprintf(stderr, "cannot get lkey!\n");
> +        return -1;
> +    }
> +    send_wr.wr_id = wr_id;
> +    send_wr.opcode = IBV_WR_RDMA_WRITE;
> +    send_wr.send_flags = flag;
> +    send_wr.sg_list = &sge;
> +    send_wr.num_sge = 1;
> +    send_wr.wr.rdma.rkey = block->remote_rkey;
> +    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
> +        (offset - block->offset);
> +
> +    if (ibv_post_send(rdma_ctx->qp, &send_wr, &bad_wr)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/* Do not merge data if larger than this. */
> +#define QEMU_RDMA_MIGRATION_MERGE_MAX (4 * 1024 * 1024)
> +
> +#define QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX 64
> +
> +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata)
> +{
> +    mdata->sync_mr = ibv_reg_mr(mdata->rdma_ctx.pd,
> +            &mdata->sync,
> +            sizeof mdata->sync,
> +            IBV_ACCESS_LOCAL_WRITE |
> +            IBV_ACCESS_REMOTE_WRITE |
> +            IBV_ACCESS_REMOTE_READ);
> +    if (mdata->sync_mr) {
> +        return 0;
> +    }
> +    return -1;
> +}
> +
> +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata)
> +{
> +    return ibv_dereg_mr(mdata->sync_mr);
> +}
> +
> +int qemu_rdma_migration_reg_remote_info(
> +        struct qemu_rdma_migration_data *mdata)
> +{
> +    mdata->remote_info_mr = ibv_reg_mr(mdata->rdma_ctx.pd,
> +            &mdata->remote_info,
> +            sizeof mdata->remote_info,
> +            IBV_ACCESS_LOCAL_WRITE |
> +            IBV_ACCESS_REMOTE_WRITE |
> +            IBV_ACCESS_REMOTE_READ);
> +    if (mdata->remote_info_mr) {
> +        return 0;
> +    }
> +    return -1;
> +}
> +
> +int qemu_rdma_migration_dereg_remote_info(
> +        struct qemu_rdma_migration_data *mdata)
> +{
> +    return ibv_dereg_mr(mdata->remote_info_mr);
> +}
> +
> +
> +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata,
> +        int wr_id)
> +{
> +    struct ibv_sge sge;
> +    struct ibv_send_wr send_wr = { 0 };
> +    struct ibv_send_wr *bad_wr;
> +
> +    mdata->sync = 1;
> +
> +    sge.addr = (uint64_t)(&mdata->sync);
> +    sge.length = sizeof mdata->sync;
> +    sge.lkey = mdata->sync_mr->lkey;
> +
> +    send_wr.wr_id = wr_id;
> +    send_wr.opcode = IBV_WR_SEND;
> +    send_wr.send_flags = IBV_SEND_SIGNALED;
> +    send_wr.sg_list = &sge;
> +    send_wr.num_sge = 1;
> +
> +    if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata,
> +        int wr_id)
> +{
> +    struct ibv_sge sge;
> +    struct ibv_recv_wr recv_wr = { 0 };
> +    struct ibv_recv_wr *bad_wr;
> +
> +    mdata->sync = 1;
> +
> +    sge.addr = (uint64_t)(&mdata->sync);
> +    sge.length = sizeof mdata->sync;
> +    sge.lkey = mdata->sync_mr->lkey;
> +
> +    recv_wr.wr_id = wr_id;
> +    recv_wr.sg_list = &sge;
> +    recv_wr.num_sge = 1;
> +
> +    if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +
> +}
> +
> +int qemu_rdma_migration_post_send_remote_info(
> +        struct qemu_rdma_migration_data *mdata)
> +{
> +    struct ibv_sge sge;
> +    struct ibv_send_wr send_wr = { 0 };
> +    struct ibv_send_wr *bad_wr;
> +
> +    sge.addr = (uint64_t)(&mdata->remote_info);
> +    sge.length = sizeof mdata->remote_info;
> +    sge.lkey = mdata->remote_info_mr->lkey;
> +
> +    send_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO;
> +    send_wr.opcode = IBV_WR_SEND;
> +    send_wr.send_flags = IBV_SEND_SIGNALED;
> +    send_wr.sg_list = &sge;
> +    send_wr.num_sge = 1;
> +
> +    if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) {
> +        return -1;
> +    }
> +
> +    mdata->num_signaled_send--;
> +    return 0;
> +}
> +
> +int qemu_rdma_migration_post_recv_remote_info(
> +        struct qemu_rdma_migration_data *mdata)
> +{
> +    struct ibv_sge sge;
> +    struct ibv_recv_wr recv_wr = { 0 };
> +    struct ibv_recv_wr *bad_wr;
> +
> +    sge.addr = (uint64_t)(&mdata->remote_info);
> +    sge.length = sizeof mdata->remote_info;
> +    sge.lkey = mdata->remote_info_mr->lkey;
> +
> +    recv_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO;
> +    recv_wr.sg_list = &sge;
> +    recv_wr.num_sge = 1;
> +
> +    if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +
> +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata)
> +{
> +    int ret;
> +    enum ibv_send_flags flags = 0;
> +
> +    if (!mdata->current_length) {
> +        return 0;
> +    }
> +    if (mdata->num_unsignaled_send >=
> +            QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) {
> +        flags = IBV_SEND_SIGNALED;
> +    }
> +    ret = qemu_rdma_write(&mdata->rdma_ctx,
> +            &(mdata->rdma_ram_blocks.block[mdata->current_index]),
> +            mdata->current_offset,
> +            mdata->current_length,
> +            QEMU_RDMA_MIGRATION_WRID_RDMA, flags);
> +
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    if (mdata->num_unsignaled_send >=
> +            QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) {
> +        mdata->num_unsignaled_send = 0;
> +        mdata->num_signaled_send++;
> +    } else {
> +        mdata->num_unsignaled_send++;
> +    }
> +
> +    mdata->total_bytes += mdata->current_length;
> +    mdata->current_length = 0;
> +    mdata->current_offset = 0;
> +
> +    return 0;
> +}
> +
> +static inline int qemu_rdma_migration_in_current_block(
> +        struct qemu_rdma_migration_data *mdata,
> +        uint64_t offset, uint64_t len)
> +{
> +    struct qemu_rdma_ram_block *block =
> +        &(mdata->rdma_ram_blocks.block[mdata->current_index]);
> +    if (mdata->current_index < 0) {
> +        return 0;
> +    }
> +    if (offset < block->offset) {
> +        return 0;
> +    }
> +    if (offset + len > block->offset + block->length) {
> +        return 0;
> +    }
> +    return 1;
> +}
> +
> +static inline int qemu_rdma_migration_in_current_chunk(
> +        struct qemu_rdma_migration_data *mdata,
> +        uint64_t offset, uint64_t len)
> +{
> +    struct qemu_rdma_ram_block *block =
> +        &(mdata->rdma_ram_blocks.block[mdata->current_index]);
> +    uint8_t *chunk_start, *chunk_end, *host_addr;
> +    if (mdata->current_chunk < 0) {
> +        return 0;
> +    }
> +    host_addr = block->local_host_addr + (offset - block->offset);
> +    chunk_start = QEMU_RDMA_REG_CHUNK_START(block, mdata->current_chunk);
> +    if (chunk_start < block->local_host_addr) {
> +        chunk_start = block->local_host_addr;
> +    }
> +    if (host_addr < chunk_start) {
> +        return 0;
> +    }
> +    chunk_end = QEMU_RDMA_REG_CHUNK_END(block, mdata->current_chunk);
> +    if (chunk_end > chunk_start + block->length) {
> +        chunk_end = chunk_start + block->length;
> +    }
> +    if (host_addr + len > chunk_end) {
> +        return 0;
> +    }
> +    return 1;
> +}
> +
> +static inline int qemu_rdma_buffer_mergable(
> +        struct qemu_rdma_migration_data *mdata,
> +        uint64_t offset, uint64_t len)
> +{
> +    if (mdata->current_length == 0) {
> +        return 0;
> +    }
> +    if (offset != mdata->current_offset + mdata->current_length) {
> +        return 0;
> +    }
> +    if (!qemu_rdma_migration_in_current_block(mdata, offset, len)) {
> +        return 0;
> +    }
> +#ifdef QEMU_RDMA_CHUNK_REGISTRATION
> +    if (!qemu_rdma_migration_in_current_chunk(mdata, offset, len)) {
> +        return 0;
> +    }
> +#endif
> +    return 1;
> +}
> +
> +/* Note that buffer must be within a single block/chunk. */
> +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata,
> +                uint64_t offset, uint64_t len)
> +{
> +    int index = mdata->current_index;
> +    int chunk_index = mdata->current_chunk;
> +    int ret;
> +
> +    /* If we cannot merge it, we flush the current buffer first. */
> +    if (!qemu_rdma_buffer_mergable(mdata, offset, len)) {
> +        ret = qemu_rdma_migration_write_flush(mdata);
> +        if (ret) {
> +            return ret;
> +        }
> +        mdata->current_length = 0;
> +        mdata->current_offset = offset;
> +
> +        if (qemu_rdma_search_ram_block(offset, len,
> +                    &mdata->rdma_ram_blocks, &index, &chunk_index)) {
> +            return -1;
> +        }
> +        mdata->current_index = index;
> +        mdata->current_chunk = chunk_index;
> +    }
> +
> +    /* merge it */
> +    mdata->current_length += len;
> +
> +    /* flush it if buffer is too large */
> +    if (mdata->current_length >= QEMU_RDMA_MIGRATION_MERGE_MAX) {
> +        return qemu_rdma_migration_write_flush(mdata);
> +    }
> +
> +    return 0;
> +}
> +
> +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata)
> +{
> +    int ret;
> +    struct ibv_wc wc;
> +
> +    ret = ibv_poll_cq(mdata->rdma_ctx.cq, 1, &wc);
> +    if (!ret) {
> +        return QEMU_RDMA_MIGRATION_WRID_NONE;
> +    }
> +    if (ret < 0) {
> +        fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
> +        return ret;
> +    }
> +    if (wc.status != IBV_WC_SUCCESS) {
> +        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
> +                        wc.status, ibv_wc_status_str(wc.status));
> +        fprintf(stderr, "ibv_poll_cq wrid=%"PRIu64"!\n", wc.wr_id);
> +
> +        return -1;
> +    }
> +
> +    if (!(wc.opcode & IBV_WC_RECV)) {
> +        mdata->num_signaled_send--;
> +    }
> +
> +    return (int)wc.wr_id;
> +}
> +
> +int qemu_rdma_migration_wait_for_wrid(
> +        struct qemu_rdma_migration_data *mdata,
> +        int wrid)
> +{
> +#ifdef QEMU_RDMA_MIGRATION_BLOCKING
> +    return qemu_rdma_migration_block_for_wrid(mdata, wrid);
> +#else
> +    return qemu_rdma_migration_poll_for_wrid(mdata, wrid);
> +#endif
> +}
> +
> +int qemu_rdma_migration_poll_for_wrid(
> +        struct qemu_rdma_migration_data *mdata,
> +        int wrid)
> +{
> +    int r = QEMU_RDMA_MIGRATION_WRID_NONE;
> +    while (r != wrid) {
> +        r = qemu_rdma_migration_poll(mdata);
> +        if (r < 0) {
> +            return r;
> +        }
> +    }
> +    return 0;
> +}
> +
> +int qemu_rdma_migration_block_for_wrid(
> +        struct qemu_rdma_migration_data *mdata,
> +        int wrid)
> +{
> +    int num_cq_events = 0;
> +    int r = QEMU_RDMA_MIGRATION_WRID_NONE;
> +    struct ibv_cq *cq;
> +    void *cq_ctx;
> +
> +    if (ibv_req_notify_cq(mdata->rdma_ctx.cq, 0)) {
> +        return -1;
> +    }
> +    /* poll cq first */
> +    while (r != wrid) {
> +        r = qemu_rdma_migration_poll(mdata);
> +        if (r < 0) {
> +            return r;
> +        }
> +        if (r == QEMU_RDMA_MIGRATION_WRID_NONE) {
> +            break;
> +        }
> +    }
> +    if (r == wrid) {
> +        return 0;
> +    }
> +
> +    while (1) {
> +        if (ibv_get_cq_event(mdata->rdma_ctx.comp_channel,
> +                    &cq, &cq_ctx)) {
> +            goto err_block_for_wrid;
> +        }
> +        num_cq_events++;
> +        if (ibv_req_notify_cq(cq, 0)) {
> +            goto err_block_for_wrid;
> +        }
> +        /* poll cq */
> +        while (r != wrid) {
> +            r = qemu_rdma_migration_poll(mdata);
> +            if (r < 0) {
> +                goto err_block_for_wrid;
> +            }
> +            if (r == QEMU_RDMA_MIGRATION_WRID_NONE) {
> +                break;
> +            }
> +        }
> +        if (r == wrid) {
> +            goto success_block_for_wrid;
> +        }
> +    }
> +
> +success_block_for_wrid:
> +    if (num_cq_events) {
> +        ibv_ack_cq_events(cq, num_cq_events);
> +    }
> +    return 0;
> +
> +err_block_for_wrid:
> +    if (num_cq_events) {
> +        ibv_ack_cq_events(cq, num_cq_events);
> +    }
> +    return -1;
> +}
> +
> +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata)
> +{
> +    mdata->enabled = 0;
> +    if (mdata->sync_mr) {
> +        qemu_rdma_migration_dereg_sync(mdata);
> +    }
> +    if (mdata->remote_info_mr) {
> +        qemu_rdma_migration_dereg_remote_info(mdata);
> +    }
> +    mdata->sync_mr = NULL;
> +    mdata->remote_info_mr = NULL;
> +    qemu_rdma_dereg_ram_blocks(&mdata->rdma_ram_blocks);
> +    mdata->rdma_ram_blocks.num_blocks = 0;
> +    qemu_rdma_cleanup(&mdata->rdma_ctx);
> +    qemu_rdma_migration_data_init(mdata);
> +}
> +
> +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata)
> +{
> +    int ret;
> +
> +    if (mdata->client_init_done) {
> +        return 0;
> +    }
> +
> +    ret = qemu_rdma_resolve_host(&mdata->rdma_ctx,
> +            mdata->host, mdata->port);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error resolving host!\n");
> +        goto err_rdma_client_init;
> +    }
> +
> +    ret = qemu_rdma_alloc_pd_cq(&mdata->rdma_ctx);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
> +        goto err_rdma_client_init;
> +    }
> +
> +    ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error allocating qp!\n");
> +        goto err_rdma_client_init;
> +    }
> +
> +    ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
> +        goto err_rdma_client_init;
> +    }
> +
> +    ret = qemu_rdma_client_reg_ram_blocks(&mdata->rdma_ctx,
> +            &mdata->rdma_ram_blocks);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error registering ram blocks!\n");
> +        goto err_rdma_client_init;
> +    }
> +
> +    ret = qemu_rdma_migration_reg_sync(mdata);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error registering sync data!\n");
> +        goto err_rdma_client_init;
> +    }
> +
> +    ret = qemu_rdma_migration_reg_remote_info(mdata);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error registering remote info!\n");
> +        goto err_rdma_client_init;
> +    }
> +
> +    ret = qemu_rdma_migration_post_recv_remote_info(mdata);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error posting remote info recv!\n");
> +        goto err_rdma_client_init;
> +    }
> +
> +    mdata->client_init_done = 1;
> +    return 0;
> +
> +err_rdma_client_init:
> +    qemu_rdma_migration_cleanup(mdata);
> +    return -1;
> +}
> +
> +
> +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata)
> +{
> +
> +    int ret;
> +
> +    ret = qemu_rdma_connect(&mdata->rdma_ctx, NULL, NULL, NULL, 0);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error connecting!\n");
> +        goto err_rdma_client_connect;
> +    }
> +
> +    /* wait for remote info */
> +    ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
> +            QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO);
> +    if (ret < 0) {
> +        fprintf(stderr, "rdma migration: polling remote info error!\n");
> +        goto err_rdma_client_connect;
> +    }
> +
> +    ret = qemu_rdma_process_remote_ram_blocks(
> +            &mdata->rdma_ram_blocks, &mdata->remote_info);
> +    if (ret) {
> +        fprintf(stderr,
> +                "rdma migration: error processing remote ram blocks!\n");
> +        goto err_rdma_client_connect;
> +    }
> +
> +    rdma_mdata.total_bytes = 0;
> +    rdma_mdata.enabled = 1;
> +    return 0;
> +
> +err_rdma_client_connect:
> +    qemu_rdma_migration_cleanup(mdata);
> +    return -1;
> +}
> +
> +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata)
> +{
> +
> +    int ret;
> +    struct sockaddr_in sin;
> +    struct rdma_cm_id *listen_id;
> +    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
> +
> +    /* create CM channel */
> +    rdma_ctx->channel = rdma_create_event_channel();
> +    if (!rdma_ctx->channel) {
> +        return -1;
> +    }
> +
> +    /* create CM id */
> +    ret = rdma_create_id(rdma_ctx->channel, &listen_id, NULL,
> +            RDMA_PS_TCP);
> +    if (ret) {
> +        goto err_server_init_create_listen_id;
> +    }
> +
> +    memset(&sin, 0, sizeof(sin));
> +    sin.sin_family = AF_INET;
> +    sin.sin_port = htons(mdata->port);
> +    if (strcmp("", mdata->host)) {
> +        struct hostent *server_addr;
> +        server_addr = gethostbyname(mdata->host);
> +        if (!server_addr) {
> +            goto err_server_init_bind_addr;
> +        }
> +        memcpy(&sin.sin_addr.s_addr, server_addr->h_addr,
> +                server_addr->h_length);
> +    } else {
> +        sin.sin_addr.s_addr = INADDR_ANY;
> +    }
> +
> +    ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin);
> +    if (ret) {
> +        goto err_server_init_bind_addr;
> +    }
> +    printf("verbs context after binding: %p\n", listen_id->verbs);
> +
> +    rdma_ctx->listen_id = listen_id;
> +    if (listen_id->verbs) {
> +        rdma_ctx->verbs = listen_id->verbs;
> +    }
> +    return 0;
> +
> +err_server_init_bind_addr:
> +    rdma_destroy_id(listen_id);
> +err_server_init_create_listen_id:
> +    rdma_destroy_event_channel(rdma_ctx->channel);
> +    rdma_ctx->channel = NULL;
> +
> +    return -1;
> +
> +}
> +
> +int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata)
> +{
> +    int ret;
> +    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
> +
> +    if (!rdma_ctx->verbs) {
> +        return 0;
> +    }
> +
> +    ret = qemu_rdma_alloc_pd_cq(rdma_ctx);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
> +        goto err_rdma_server_prepare;
> +    }
> +
> +    ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
> +        goto err_rdma_server_prepare;
> +    }
> +
> +    ret = qemu_rdma_server_reg_ram_blocks(rdma_ctx,
> +            &mdata->rdma_ram_blocks);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error registering ram blocks!\n");
> +        goto err_rdma_server_prepare;
> +    }
> +
> +    ret = qemu_rdma_migration_reg_sync(mdata);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error registering sync data!\n");
> +        goto err_rdma_server_prepare;
> +    }
> +
> +    qemu_rdma_copy_to_remote_ram_blocks(&mdata->rdma_ram_blocks,
> +            &mdata->remote_info);
> +
> +    ret = qemu_rdma_migration_reg_remote_info(mdata);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error registering remote info!\n");
> +        goto err_rdma_server_prepare;
> +    }
> +
> +    ret = rdma_listen(rdma_ctx->listen_id, 5);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error listening on socket!\n");
> +        goto err_rdma_server_prepare;
> +    }
> +
> +    return 0;
> +
> +err_rdma_server_prepare:
> +    qemu_rdma_migration_cleanup(mdata);
> +    return -1;
> +}
> +
> +int qemu_rdma_migration_server_wait_for_client(
> +                                    struct qemu_rdma_migration_data *mdata)
> +{
> +
> +    int ret;
> +
> +    ret = qemu_rdma_listen(mdata, mdata->host, mdata->port);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error listening!\n");
> +        goto err_rdma_server_wait;
> +    }
> +
> +    ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error allocating qp!\n");
> +        goto err_rdma_server_wait;
> +    }
> +
> +#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC
> +    ret = qemu_rdma_migration_post_recv_sync(mdata,
> +            QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error posting extra sync receive!\n");
> +        goto err_rdma_server_wait;
> +    }
> +#endif
> +
> +    ret = qemu_rdma_migration_post_recv_sync(mdata,
> +            QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error posting sync receive!\n");
> +        goto err_rdma_server_wait;
> +    }
> +
> +    ret = qemu_rdma_accept(&mdata->rdma_ctx, NULL, NULL, NULL, 0);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error accepting connection!\n");
> +        goto err_rdma_server_wait;
> +    }
> +
> +    /* send remote info */
> +    ret = qemu_rdma_migration_post_send_remote_info(mdata);
> +    if (ret) {
> +        fprintf(stderr, "rdma migration: error sending remote info!\n");
> +        goto err_rdma_server_wait;
> +    }
> +
> +    /* wait for completion */
> +    ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
> +            QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO);
> +    if (ret < 0) {
> +        fprintf(stderr, "rdma migration: polling remote info error!\n");
> +        goto err_rdma_server_wait;
> +    }
> +
> +    rdma_mdata.total_bytes = 0;
> +    rdma_mdata.enabled = 1;
> +    return 0;
> +
> +err_rdma_server_wait:
> +    qemu_rdma_migration_cleanup(mdata);
> +    return -1;
> +
> +}
> +
> +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata)
> +{
> +    qemu_rdma_init_context(&mdata->rdma_ctx);
> +    mdata->port = rdmaport;
> +    mdata->host = rdmahost;
> +    mdata->enabled = 0;
> +    mdata->rdma_ram_blocks.num_blocks = 0;
> +    mdata->client_init_done = 0;
> +    mdata->num_unsignaled_send = 0;
> +    mdata->num_signaled_send = 0;
> +    mdata->current_offset = 0;
> +    mdata->current_length = 0;
> +    mdata->current_index = -1;
> +    mdata->current_chunk = -1;
> +    mdata->sync = 0;
> +    mdata->sync_mr = NULL;
> +    mdata->remote_info_mr = NULL;
> +}
> +
> +void qemu_rdma_migration_disable(void)
> +{
> +    rdma_mdata.port = -1;
> +    rdma_mdata.enabled = 0;
> +}
>
mrhines@linux.vnet.ibm.com Feb. 19, 2013, 1:35 a.m. UTC | #5
Acknowledged.

On 02/18/2013 06:02 AM, Paolo Bonzini wrote:
> On 28/01/2013 23:01, mrhines@linux.vnet.ibm.com wrote:
>> From: "Michael R. Hines" <mrhines@us.ibm.com>
>>
>>
>> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
>> ---
>>   Makefile.target     |    5 +-
>>   include/qemu/rdma.h |  249 ++++++++++
> Please make this include/migration/rdma.h
>
> Paolo
>
>>   qemu-rdma.c         | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 1609 insertions(+), 2 deletions(-)
>>   create mode 100644 include/qemu/rdma.h
>>   create mode 100644 qemu-rdma.c
>>
>> diff --git a/Makefile.target b/Makefile.target
>> index 760da1e..d1d6b8c 100644
>> --- a/Makefile.target
>> +++ b/Makefile.target
>> @@ -112,12 +112,13 @@ obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o
>>   obj-y += hw/
>>   obj-$(CONFIG_KVM) += kvm-all.o
>>   obj-$(CONFIG_NO_KVM) += kvm-stub.o
>> -obj-y += memory.o savevm.o cputlb.o
>> +# "tracefunc.o" will go away - I use GCC's -finstrument-functions support inside tracefunc.o
>> +obj-y += memory.o savevm.o cputlb.o qemu-rdma.o #tracefunc.o
>>   obj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += memory_mapping.o
>>   obj-$(CONFIG_HAVE_CORE_DUMP) += dump.o
>>   obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o
>>   obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o
>> -LIBS+=-lz
>> +LIBS+=-lz -lrdmacm
>>   
>>   # xen support
>>   obj-$(CONFIG_XEN) += xen-all.o xen-mapcache.o
>> diff --git a/include/qemu/rdma.h b/include/qemu/rdma.h
>> new file mode 100644
>> index 0000000..099622e
>> --- /dev/null
>> +++ b/include/qemu/rdma.h
>> @@ -0,0 +1,249 @@
>> +/*
>> + * RDMA data structures and helper functions header (for migration)
>> + *
>> + * Copyright IBM, Corp. 2013
>> + *
>> + * Authors:
>> + *  Michael R. Hines <mrhines@us.ibm.com>
>> + *  Jiuxing Liu <jl@us.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#ifndef _QEMU_RDMA_H
>> +#define _QEMU_RDMA_H
>> +
>> +#include <rdma/rdma_cma.h>
>> +#include "monitor/monitor.h"
>> +
>> +extern int rdmaport;
>> +extern char rdmahost[64];
>> +
>> +struct qemu_rdma_context {
>> +    /* cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
>> +       cm_id->verbs, cm_id->channel, and cm_id->qp. */
>> +    struct rdma_cm_id *cm_id;
>> +    struct rdma_cm_id *listen_id;
>> +
>> +    struct ibv_context *verbs;
>> +    struct rdma_event_channel *channel;
>> +    struct ibv_qp *qp;
>> +
>> +    struct ibv_comp_channel *comp_channel;
>> +    struct ibv_pd *pd;
>> +    struct ibv_cq *cq;
>> +};
>> +
>> +static inline void qemu_rdma_init_context(struct qemu_rdma_context *rdma_ctx)
>> +{
>> +    rdma_ctx->cm_id = NULL;
>> +    rdma_ctx->listen_id = NULL;
>> +    rdma_ctx->verbs = NULL;
>> +    rdma_ctx->channel = NULL;
>> +    rdma_ctx->qp = NULL;
>> +    rdma_ctx->comp_channel = NULL;
>> +    rdma_ctx->pd = NULL;
>> +    rdma_ctx->cq = NULL;
>> +}
>> +
>> +void cpu_physical_memory_reset_dirty_all(void);
>> +
>> +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx,
>> +        const char *host, int port);
>> +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx);
>> +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx);
>> +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx,
>> +        void *in_data, int *in_len, void *out_data, int out_len);
>> +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx,
>> +        void *in_data, int *in_len, void *out_data, int out_len);
>> +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx);
>> +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx);
>> +
>> +/* Instead of registering whole ram blocks, we can register them in smaller
>> + * chunks. This may be beneficial if the ram blocks have holes in them. */
>> +#define QEMU_RDMA_CHUNK_REGISTRATION
>> +
>> +#define QEMU_RDMA_LAZY_REGISTRATION
>> +
>> +#define QEMU_RDMA_REG_CHUNK_SHIFT 20
>> +#define QEMU_RDMA_REG_CHUNK_SIZE (1UL << (QEMU_RDMA_REG_CHUNK_SHIFT))
>> +#define QEMU_RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \
>> +            (((unsigned long)(host_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT) - \
>> +            ((unsigned long)(start_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT))
>> +#define QEMU_RDMA_REG_NUM_CHUNKS(rdma_ram_block) \
>> +            (QEMU_RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\
>> +                (rdma_ram_block)->local_host_addr +\
>> +                (rdma_ram_block)->length) + 1)
>> +#define QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\
>> +            ((((unsigned long)((rdma_ram_block)->local_host_addr) >> \
>> +                QEMU_RDMA_REG_CHUNK_SHIFT) + (i)) << \
>> +                QEMU_RDMA_REG_CHUNK_SHIFT))
>> +#define QEMU_RDMA_REG_CHUNK_END(rdma_ram_block, i) \
>> +            (QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) + \
>> +             QEMU_RDMA_REG_CHUNK_SIZE)
>> +
>> +struct qemu_rdma_ram_block {
>> +    uint8_t *local_host_addr;
>> +    uint64_t remote_host_addr;
>> +    uint64_t offset;
>> +    uint64_t length;
>> +    struct ibv_mr **pmr;
>> +    struct ibv_mr *mr;
>> +    uint32_t remote_rkey;
>> +};
>> +
>> +struct qemu_rdma_remote_ram_block {
>> +    uint64_t remote_host_addr;
>> +    uint64_t offset;
>> +    uint64_t length;
>> +    uint32_t remote_rkey;
>> +};
>> +
>> +#define QEMU_MAX_RAM_BLOCKS 64
>> +
>> +struct qemu_rdma_ram_blocks {
>> +    int num_blocks;
>> +    struct qemu_rdma_ram_block block[QEMU_MAX_RAM_BLOCKS];
>> +};
>> +
>> +struct qemu_rdma_remote_ram_blocks {
>> +    int num_blocks;
>> +    struct qemu_rdma_remote_ram_block block[QEMU_MAX_RAM_BLOCKS];
>> +};
>> +
>> +int qemu_rdma_init_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks);
>> +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx,
>> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
>> +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx,
>> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
>> +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
>> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
>> +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
>> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
>> +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks);
>> +
>> +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
>> +        struct qemu_rdma_remote_ram_blocks *remote);
>> +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
>> +        struct qemu_rdma_remote_ram_blocks *remote);
>> +
>> +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length,
>> +        struct qemu_rdma_ram_blocks *blocks,
>> +        int *block_index, int *chunk_index);
>> +
>> +struct qemu_rdma_migration_data {
>> +    char *host;
>> +    int port;
>> +    int enabled;
>> +
>> +    struct qemu_rdma_context rdma_ctx;
>> +    struct qemu_rdma_ram_blocks rdma_ram_blocks;
>> +
>> +    /* This is used for synchronization: We use
>> +       IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs
>> +       are done. When the receiver gets it, it can be certain
>> +       that all the RDMAs are completed. */
>> +    int sync;
>> +    struct ibv_mr *sync_mr;
>> +
>> +    /* This is used for the server to write the remote
>> +       ram blocks info. */
>> +    struct qemu_rdma_remote_ram_blocks remote_info;
>> +    struct ibv_mr *remote_info_mr;
>> +
>> +    /* The rest is only for the initiator of the migration. */
>> +    int client_init_done;
>> +
>> +    /* number of outstanding unsignaled send */
>> +    int num_unsignaled_send;
>> +
>> +    /* number of outstanding signaled send */
>> +    int num_signaled_send;
>> +
>> +    /* store info about current buffer so that we can
>> +       merge it with future sends */
>> +    uint64_t current_offset;
>> +    uint64_t current_length;
>> +    /* index of ram block the current buffer belongs to */
>> +    int current_index;
>> +    /* index of the chunk in the current ram block */
>> +    int current_chunk;
>> +
>> +    uint64_t total_bytes;
>> +
>> +};
>> +
>> +extern struct qemu_rdma_migration_data rdma_mdata;
>> +
>> +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata);
>> +
>> +static inline int qemu_use_rdma_migration(void)
>> +{
>> +    /* port is set to -1 unless the user requested RDMA migration. */
>> +    return rdma_mdata.port != -1;
>> +}
>> +
>> +static inline int qemu_rdma_migration_enabled(void)
>> +{
>> +    return rdma_mdata.enabled;
>> +}
>> +
>> +void qemu_rdma_migration_disable(void);
>> +
>> +#define QEMU_RDMA_MIGRATION_BLOCKING
>> +#define QEMU_RDMA_MIGRATION_EXTRA_SYNC
>> +
>> +enum {
>> +    QEMU_RDMA_MIGRATION_WRID_NONE = 0,
>> +    QEMU_RDMA_MIGRATION_WRID_RDMA,
>> +    QEMU_RDMA_MIGRATION_WRID_SEND_SYNC,
>> +    QEMU_RDMA_MIGRATION_WRID_RECV_SYNC,
>> +    QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO,
>> +    QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO,
>> +    QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC,
>> +    QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC,
>> +};
>> +
>> +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host,
>> +                                                              int port);
>> +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata,
>> +        int wr_id);
>> +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata,
>> +        int wr_id);
>> +
>> +int qemu_rdma_migration_reg_remote_info(
>> +        struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_dereg_remote_info(
>> +        struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_post_send_remote_info(
>> +        struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_post_recv_remote_info(
>> +        struct qemu_rdma_migration_data *mdata);
>> +
>> +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata,
>> +        uint64_t addr, uint64_t len);
>> +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_wait_for_wrid(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        int wrid);
>> +int qemu_rdma_migration_poll_for_wrid(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        int wrid);
>> +int qemu_rdma_migration_block_for_wrid(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        int wrid);
>> +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata);
>> +
>> +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata);
>> +int qemu_rdma_migration_server_wait_for_client(
>> +                                       struct qemu_rdma_migration_data *mdata);
>> +
>> +#endif
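
As an aside on the chunk macros earlier in this header: with
QEMU_RDMA_REG_CHUNK_SHIFT at 20, each chunk is 1 MB and the index is plain
shift arithmetic. A standalone illustration (the sample addresses are made
up, and CHUNK_INDEX merely mirrors QEMU_RDMA_REG_CHUNK_INDEX):

    #include <stdio.h>

    #define CHUNK_SHIFT 20  /* mirrors QEMU_RDMA_REG_CHUNK_SHIFT */
    #define CHUNK_INDEX(start, host) \
        (((unsigned long)(host) >> CHUNK_SHIFT) - \
         ((unsigned long)(start) >> CHUNK_SHIFT))

    int main(void)
    {
        unsigned long start = 0x7f0000123000UL; /* hypothetical, unaligned block */
        unsigned long length = 3UL << 20;       /* 3 MB */

        /* an unaligned 3 MB block straddles four 1 MB chunks */
        printf("chunks: %lu\n", CHUNK_INDEX(start, start + length) + 1);

        /* a page 1.5 MB into the block lands in chunk 1 */
        printf("index: %lu\n", CHUNK_INDEX(start, start + (3UL << 19)));
        return 0;
    }

This prints "chunks: 4" and "index: 1", and is why
qemu_rdma_reg_chunk_ram_blocks clamps the first and last chunk to the
block's real bounds before calling ibv_reg_mr.
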
>> diff --git a/qemu-rdma.c b/qemu-rdma.c
>> new file mode 100644
>> index 0000000..5f16875
>> --- /dev/null
>> +++ b/qemu-rdma.c
>> @@ -0,0 +1,1357 @@
>> +/*
>> + * RDMA data structures and helper functions (for migration)
>> + *
>> + * Copyright IBM, Corp. 2013
>> + *
>> + * Authors:
>> + *  Michael R. Hines <mrhines@us.ibm.com>
>> + *  Jiuxing Liu <jl@us.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + *
>> + */
>> +
>> +#include "qemu/rdma.h"
>> +#include "qemu-common.h"
>> +#include <stdio.h>
>> +#include <sys/types.h>
>> +#include <sys/socket.h>
>> +#include <netdb.h>
>> +#include <arpa/inet.h>
>> +#include <string.h>
>> +
>> +#define QEMU_RDMA_RESOLVE_TIMEOUT_MS 10000
>> +#define QEMU_RDMA_CQ_SIZE 2000
>> +#define QEMU_RDMA_QP_SIZE 1000
>> +
>> +int rdmaport = -1;
>> +char rdmahost[64] = "";
>> +struct qemu_rdma_migration_data rdma_mdata;
>> +
>> +static void *qemu_rdma_mallocz(size_t size)
>> +{
>> +    void *ptr = malloc(size);
>> +
>> +    if (ptr) {
>> +        memset(ptr, 0, size);
>> +    }
>> +    return ptr;
>> +}
>> +
>> +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx,
>> +        const char *host, int port)
>> +{
>> +    int ret;
>> +    struct addrinfo *res;
>> +    char port_str[16];
>> +    struct rdma_cm_event *cm_event;
>> +
>> +
>> +    if (!strcmp(host, "")) {
>> +        printf("RDMA hostname has not been set\n");
>> +        return -1;
>> +    }
>> +
>> +    /* create CM channel */
>> +    rdma_ctx->channel = rdma_create_event_channel();
>> +    if (!rdma_ctx->channel) {
>> +        printf("could not create CM channel\n");
>> +        return -1;
>> +    }
>> +
>> +    /* create CM id */
>> +    ret = rdma_create_id(rdma_ctx->channel, &rdma_ctx->cm_id, NULL,
>> +            RDMA_PS_TCP);
>> +    if (ret) {
>> +        printf("could not create channel id\n");
>> +        goto err_resolve_create_id;
>> +    }
>> +
>> +    snprintf(port_str, 16, "%d", port);
>> +    port_str[15] = '\0';
>> +    ret = getaddrinfo(host, port_str, NULL, &res);
>> +    if (ret) {
>> +        printf("could not getaddrinfo address %s\n", host);
>> +        goto err_resolve_get_addr;
>> +    }
>> +
>> +    /* resolve the first address */
>> +    ret = rdma_resolve_addr(rdma_ctx->cm_id, NULL, res->ai_addr,
>> +            QEMU_RDMA_RESOLVE_TIMEOUT_MS);
>> +    freeaddrinfo(res); /* rdma_resolve_addr copies the sockaddr */
>> +    if (ret) {
>> +        printf("could not resolve address %s\n", host);
>> +        goto err_resolve_get_addr;
>> +    }
>> +
>> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
>> +    if (ret) {
>> +        printf("could not perform event_addr_resolved\n");
>> +        goto err_resolve_get_addr;
>> +    }
>> +    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
>> +        printf("result not equal to event_addr_resolved\n");
>> +        rdma_ack_cm_event(cm_event);
>> +        goto err_resolve_get_addr;
>> +    }
>> +    rdma_ack_cm_event(cm_event);
>> +
>> +    /* resolve route */
>> +    ret = rdma_resolve_route(rdma_ctx->cm_id, QEMU_RDMA_RESOLVE_TIMEOUT_MS);
>> +    if (ret) {
>> +        printf("could not resolve rdma route\n");
>> +        goto err_resolve_get_addr;
>> +    }
>> +
>> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
>> +    if (ret) {
>> +        printf("could not perform event_route_resolved\n");
>> +        goto err_resolve_get_addr;
>> +    }
>> +    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
>> +        printf("result not equal to event_route_resolved\n");
>> +        rdma_ack_cm_event(cm_event);
>> +        goto err_resolve_get_addr;
>> +    }
>> +    rdma_ack_cm_event(cm_event);
>> +
>> +    rdma_ctx->verbs = rdma_ctx->cm_id->verbs;
>> +    return 0;
>> +
>> +err_resolve_get_addr:
>> +    rdma_destroy_id(rdma_ctx->cm_id);
>> +err_resolve_create_id:
>> +    rdma_destroy_event_channel(rdma_ctx->channel);
>> +    rdma_ctx->channel = NULL;
>> +
>> +    return -1;
>> +}
>> +
>> +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx)
>> +{
>> +
>> +    /* allocate pd */
>> +    rdma_ctx->pd = ibv_alloc_pd(rdma_ctx->verbs);
>> +    if (!rdma_ctx->pd) {
>> +        return -1;
>> +    }
>> +
>> +#ifdef QEMU_RDMA_MIGRATION_BLOCKING
>> +    /* create completion channel */
>> +    rdma_ctx->comp_channel = ibv_create_comp_channel(rdma_ctx->verbs);
>> +    if (!rdma_ctx->comp_channel) {
>> +        goto err_alloc_pd_cq;
>> +    }
>> +#endif
>> +
>> +    /* create cq */
>> +    rdma_ctx->cq = ibv_create_cq(rdma_ctx->verbs, QEMU_RDMA_CQ_SIZE,
>> +            NULL, rdma_ctx->comp_channel, 0);
>> +    if (!rdma_ctx->cq) {
>> +        goto err_alloc_pd_cq;
>> +    }
>> +
>> +    return 0;
>> +
>> +err_alloc_pd_cq:
>> +    if (rdma_ctx->pd) {
>> +        ibv_dealloc_pd(rdma_ctx->pd);
>> +    }
>> +    if (rdma_ctx->comp_channel) {
>> +        ibv_destroy_comp_channel(rdma_ctx->comp_channel);
>> +    }
>> +    rdma_ctx->pd = NULL;
>> +    rdma_ctx->comp_channel = NULL;
>> +    return -1;
>> +
>> +}
>> +
>> +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx)
>> +{
>> +    struct ibv_qp_init_attr attr = { 0 };
>> +    int ret;
>> +
>> +    attr.cap.max_send_wr = QEMU_RDMA_QP_SIZE;
>> +    attr.cap.max_recv_wr = 2;
>> +    attr.cap.max_send_sge = 1;
>> +    attr.cap.max_recv_sge = 1;
>> +    attr.send_cq = rdma_ctx->cq;
>> +    attr.recv_cq = rdma_ctx->cq;
>> +    attr.qp_type = IBV_QPT_RC;
>> +
>> +    ret = rdma_create_qp(rdma_ctx->cm_id, rdma_ctx->pd, &attr);
>> +    if (ret) {
>> +        return -1;
>> +    }
>> +
>> +    rdma_ctx->qp = rdma_ctx->cm_id->qp;
>> +    return 0;
>> +}
>> +
>> +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx,
>> +        void *in_data, int *in_len, void *out_data, int out_len)
>> +{
>> +    int ret;
>> +    struct rdma_conn_param conn_param = { 0 };
>> +    struct rdma_cm_event *cm_event;
>> +
>> +    conn_param.initiator_depth = 2;
>> +    conn_param.retry_count = 5;
>> +    conn_param.private_data = out_data;
>> +    conn_param.private_data_len = out_len;
>> +
>> +    ret = rdma_connect(rdma_ctx->cm_id, &conn_param);
>> +    if (ret) {
>> +        return -1;
>> +    }
>> +
>> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
>> +    if (ret) {
>> +        return -1;
>> +    }
>> +    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
>> +        rdma_ack_cm_event(cm_event);
>> +        return -1;
>> +    }
>> +
>> +    if (in_len) {
>> +        if (*in_len > cm_event->param.conn.private_data_len) {
>> +            *in_len = cm_event->param.conn.private_data_len;
>> +        }
>> +        if (*in_len) {
>> +            memcpy(in_data, cm_event->param.conn.private_data, *in_len);
>> +        }
>> +    }
>> +
>> +    rdma_ack_cm_event(cm_event);
>> +
>> +    return 0;
>> +}
>> +
>> +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host,
>> +        int port)
>> +{
>> +    int ret;
>> +    struct rdma_cm_event *cm_event;
>> +    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
>> +    struct ibv_context *verbs;
>> +
>> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
>> +    if (ret) {
>> +        goto err_listen;
>> +    }
>> +
>> +    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
>> +        rdma_ack_cm_event(cm_event);
>> +        goto err_listen;
>> +    }
>> +
>> +    rdma_ctx->cm_id = cm_event->id;
>> +    verbs = cm_event->id->verbs;
>> +    printf("verbs context after listen: %p\n", verbs);
>> +    rdma_ack_cm_event(cm_event);
>> +
>> +    if (!rdma_ctx->verbs) {
>> +        rdma_ctx->verbs = verbs;
>> +        ret = qemu_rdma_migration_server_prepare(mdata);
>> +        if (ret) {
>> +            fprintf(stderr, "rdma migration: error preparing server!\n");
>> +            goto err_listen;
>> +        }
>> +    } else if (rdma_ctx->verbs != verbs) {
>> +            fprintf(stderr, "ibv context not matching %p, %p!\n",
>> +                    rdma_ctx->verbs, verbs);
>> +            goto err_listen;
>> +    }
>> +    /* xxx destroy listen_id ??? */
>> +
>> +    return 0;
>> +
>> +err_listen:
>> +
>> +    return -1;
>> +
>> +}
>> +
>> +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx,
>> +        void *in_data, int *in_len, void *out_data, int out_len)
>> +{
>> +    int ret;
>> +    struct rdma_conn_param conn_param = { 0 };
>> +    struct rdma_cm_event *cm_event;
>> +
>> +    conn_param.responder_resources = 2;
>> +    conn_param.private_data = out_data;
>> +    conn_param.private_data_len = out_len;
>> +
>> +    ret = rdma_accept(rdma_ctx->cm_id, &conn_param);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma_accept returns %d!\n", ret);
>> +        return -1;
>> +    }
>> +
>> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
>> +    if (ret) {
>> +        return -1;
>> +    }
>> +
>> +    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
>> +        rdma_ack_cm_event(cm_event);
>> +        return -1;
>> +    }
>> +
>> +    if (in_len) {
>> +        if (*in_len > cm_event->param.conn.private_data_len) {
>> +            *in_len = cm_event->param.conn.private_data_len;
>> +        }
>> +        if (*in_len) {
>> +            memcpy(in_data, cm_event->param.conn.private_data, *in_len);
>> +        }
>> +    }
>> +
>> +    rdma_ack_cm_event(cm_event);
>> +
>> +    return 0;
>> +}
>> +
>> +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx)
>> +{
>> +    int ret;
>> +    struct rdma_cm_event *cm_event;
>> +
>> +    ret = rdma_disconnect(rdma_ctx->cm_id);
>> +    if (ret) {
>> +        return;
>> +    }
>> +    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
>> +    if (ret) {
>> +        return;
>> +    }
>> +    rdma_ack_cm_event(cm_event);
>> +}
>> +
>> +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx)
>> +{
>> +    if (rdma_ctx->qp) {
>> +        ibv_destroy_qp(rdma_ctx->qp);
>> +    }
>> +    if (rdma_ctx->cq) {
>> +        ibv_destroy_cq(rdma_ctx->cq);
>> +    }
>> +    if (rdma_ctx->comp_channel) {
>> +        ibv_destroy_comp_channel(rdma_ctx->comp_channel);
>> +    }
>> +    if (rdma_ctx->pd) {
>> +        ibv_dealloc_pd(rdma_ctx->pd);
>> +    }
>> +    if (rdma_ctx->listen_id) {
>> +        rdma_destroy_id(rdma_ctx->listen_id);
>> +    }
>> +    if (rdma_ctx->cm_id) {
>> +        rdma_destroy_id(rdma_ctx->cm_id);
>> +    }
>> +    if (rdma_ctx->channel) {
>> +        rdma_destroy_event_channel(rdma_ctx->channel);
>> +    }
>> +}
>> +
>> +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx,
>> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks)
>> +{
>> +    int i, j;
>> +    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
>> +        struct qemu_rdma_ram_block *block = &(rdma_ram_blocks->block[i]);
>> +        int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block);
>> +        /* allocate memory to store chunk MRs */
>> +        rdma_ram_blocks->block[i].pmr = qemu_rdma_mallocz(
>> +                                num_chunks * sizeof(struct ibv_mr *));
>> +
>> +        if (!block->pmr) {
>> +            goto err_reg_chunk_ram_blocks;
>> +        }
>> +
>> +        for (j = 0; j < num_chunks; j++) {
>> +            uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, j);
>> +            uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, j);
>> +            if (start_addr < block->local_host_addr) {
>> +                start_addr = block->local_host_addr;
>> +            }
>> +            if (end_addr > block->local_host_addr + block->length) {
>> +                end_addr = block->local_host_addr + block->length;
>> +            }
>> +            block->pmr[j] = ibv_reg_mr(rdma_ctx->pd,
>> +                                start_addr,
>> +                                end_addr - start_addr,
>> +                                IBV_ACCESS_LOCAL_WRITE |
>> +                                IBV_ACCESS_REMOTE_WRITE |
>> +                                IBV_ACCESS_REMOTE_READ);
>> +            if (!block->pmr[j]) {
>> +                break;
>> +            }
>> +        }
>> +        if (j < num_chunks) {
>> +            for (j--; j >= 0; j--) {
>> +                ibv_dereg_mr(block->pmr[j]);
>> +            }
>> +            free(block->pmr);
>> +            block->pmr = NULL;
>> +            goto err_reg_chunk_ram_blocks;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +
>> +err_reg_chunk_ram_blocks:
>> +    for (i--; i >= 0; i--) {
>> +        int num_chunks =
>> +            QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
>> +        for (j = 0; j < num_chunks; j++) {
>> +            ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]);
>> +        }
>> +        free(rdma_ram_blocks->block[i].pmr);
>> +        rdma_ram_blocks->block[i].pmr = NULL;
>> +    }
>> +
>> +    return -1;
>> +
>> +}
>> +
>> +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx,
>> +        struct qemu_rdma_ram_blocks *rdma_ram_blocks)
>> +{
>> +    int i;
>> +    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
>> +        rdma_ram_blocks->block[i].mr =
>> +            ibv_reg_mr(rdma_ctx->pd,
>> +                    rdma_ram_blocks->block[i].local_host_addr,
>> +                    rdma_ram_blocks->block[i].length,
>> +                    IBV_ACCESS_LOCAL_WRITE |
>> +                    IBV_ACCESS_REMOTE_WRITE |
>> +                    IBV_ACCESS_REMOTE_READ);
>> +        if (!rdma_ram_blocks->block[i].mr) {
>> +                break;
>> +        }
>> +    }
>> +
>> +    if (i >= rdma_ram_blocks->num_blocks) {
>> +        return 0;
>> +    }
>> +
>> +    for (i--; i >= 0; i--) {
>> +        ibv_dereg_mr(rdma_ram_blocks->block[i].mr);
>> +    }
>> +
>> +    return -1;
>> +
>> +}
>> +
>> +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
>> +                struct qemu_rdma_ram_blocks *rdma_ram_blocks)
>> +{
>> +#ifdef QEMU_RDMA_CHUNK_REGISTRATION
>> +#ifdef QEMU_RDMA_LAZY_REGISTRATION
>> +    return 0;
>> +#else
>> +    return qemu_rdma_reg_chunk_ram_blocks(rdma_ctx, rdma_ram_blocks);
>> +#endif
>> +#else
>> +    return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks);
>> +#endif
>> +}
>> +
>> +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
>> +                struct qemu_rdma_ram_blocks *rdma_ram_blocks)
>> +{
>> +    return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks);
>> +}
>> +
>> +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks)
>> +{
>> +    int i, j;
>> +    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
>> +        int num_chunks;
>> +        if (!rdma_ram_blocks->block[i].pmr) {
>> +            continue;
>> +        }
>> +        num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
>> +        for (j = 0; j < num_chunks; j++) {
>> +            if (!rdma_ram_blocks->block[i].pmr[j]) {
>> +                continue;
>> +            }
>> +            ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]);
>> +        }
>> +        free(rdma_ram_blocks->block[i].pmr);
>> +        rdma_ram_blocks->block[i].pmr = NULL;
>> +    }
>> +    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
>> +        if (!rdma_ram_blocks->block[i].mr) {
>> +            continue;
>> +        }
>> +        ibv_dereg_mr(rdma_ram_blocks->block[i].mr);
>> +        rdma_ram_blocks->block[i].mr = NULL;
>> +    }
>> +}
>> +
>> +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
>> +                struct qemu_rdma_remote_ram_blocks *remote)
>> +{
>> +    int i;
>> +    remote->num_blocks = local->num_blocks;
>> +    for (i = 0; i < local->num_blocks; i++) {
>> +            remote->block[i].remote_host_addr =
>> +                (uint64_t)(local->block[i].local_host_addr);
>> +            remote->block[i].remote_rkey = local->block[i].mr->rkey;
>> +            remote->block[i].offset = local->block[i].offset;
>> +            remote->block[i].length = local->block[i].length;
>> +    }
>> +}
>> +
>> +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
>> +                struct qemu_rdma_remote_ram_blocks *remote)
>> +{
>> +    int i, j;
>> +
>> +    if (local->num_blocks != remote->num_blocks) {
>> +        return -1;
>> +    }
>> +
>> +    for (i = 0; i < remote->num_blocks; i++) {
>> +        /* search local ram blocks */
>> +        for (j = 0; j < local->num_blocks; j++) {
>> +            if (remote->block[i].offset != local->block[j].offset) {
>> +                continue;
>> +            }
>> +            if (remote->block[i].length != local->block[j].length) {
>> +                return -1;
>> +            }
>> +            local->block[j].remote_host_addr =
>> +                remote->block[i].remote_host_addr;
>> +            local->block[j].remote_rkey = remote->block[i].remote_rkey;
>> +            break;
>> +        }
>> +        if (j >= local->num_blocks) {
>> +            return -1;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length,
>> +                struct qemu_rdma_ram_blocks *blocks,
>> +                int *block_index, int *chunk_index)
>> +{
>> +    int i;
>> +    for (i = 0; i < blocks->num_blocks; i++) {
>> +        if (offset < blocks->block[i].offset) {
>> +            continue;
>> +        }
>> +        if (offset + length >
>> +                blocks->block[i].offset + blocks->block[i].length) {
>> +            continue;
>> +        }
>> +        *block_index = i;
>> +        if (chunk_index) {
>> +            uint8_t *host_addr = blocks->block[i].local_host_addr +
>> +                (offset - blocks->block[i].offset);
>> +            *chunk_index = QEMU_RDMA_REG_CHUNK_INDEX(
>> +                    blocks->block[i].local_host_addr, host_addr);
>> +        }
>> +        return 0;
>> +    }
>> +    return -1;
>> +}
>> +
>> +static int qemu_rdma_get_lkey(struct qemu_rdma_context *rdma_ctx,
>> +        struct qemu_rdma_ram_block *block, uint64_t host_addr,
>> +        uint32_t *lkey)
>> +{
>> +    int chunk;
>> +    if (block->mr) {
>> +        *lkey = block->mr->lkey;
>> +        return 0;
>> +    }
>> +    if (!block->pmr) {
>> +        int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block);
>> +        /* allocate memory to store chunk MRs */
>> +        block->pmr = qemu_rdma_mallocz(num_chunks *
>> +                sizeof(struct ibv_mr *));
>> +        if (!block->pmr) {
>> +            return -1;
>> +        }
>> +    }
>> +    chunk = QEMU_RDMA_REG_CHUNK_INDEX(block->local_host_addr, host_addr);
>> +    if (!block->pmr[chunk]) {
>> +        uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, chunk);
>> +        uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, chunk);
>> +        if (start_addr < block->local_host_addr) {
>> +            start_addr = block->local_host_addr;
>> +        }
>> +        if (end_addr > block->local_host_addr + block->length) {
>> +            end_addr = block->local_host_addr + block->length;
>> +        }
>> +        block->pmr[chunk] = ibv_reg_mr(rdma_ctx->pd,
>> +                start_addr,
>> +                end_addr - start_addr,
>> +                IBV_ACCESS_LOCAL_WRITE |
>> +                IBV_ACCESS_REMOTE_WRITE |
>> +                IBV_ACCESS_REMOTE_READ);
>> +        if (!block->pmr[chunk]) {
>> +            return -1;
>> +        }
>> +    }
>> +    *lkey = block->pmr[chunk]->lkey;
>> +    return 0;
>> +}
>> +
>> +static int qemu_rdma_write(struct qemu_rdma_context *rdma_ctx,
>> +        struct qemu_rdma_ram_block *block,
>> +        uint64_t offset, uint64_t length,
>> +        uint64_t wr_id, enum ibv_send_flags flag)
>> +{
>> +    struct ibv_sge sge;
>> +    struct ibv_send_wr send_wr = { 0 };
>> +    struct ibv_send_wr *bad_wr;
>> +
>> +    sge.addr = (uint64_t)(block->local_host_addr + (offset - block->offset));
>> +    sge.length = length;
>> +    if (qemu_rdma_get_lkey(rdma_ctx, block, sge.addr, &sge.lkey)) {
>> +        fprintf(stderr, "cannot get lkey!\n");
>> +        return -1;
>> +    }
>> +    send_wr.wr_id = wr_id;
>> +    send_wr.opcode = IBV_WR_RDMA_WRITE;
>> +    send_wr.send_flags = flag;
>> +    send_wr.sg_list = &sge;
>> +    send_wr.num_sge = 1;
>> +    send_wr.wr.rdma.rkey = block->remote_rkey;
>> +    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
>> +        (offset - block->offset);
>> +
>> +    if (ibv_post_send(rdma_ctx->qp, &send_wr, &bad_wr)) {
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +/* Do not merge data if larger than this. */
>> +#define QEMU_RDMA_MIGRATION_MERGE_MAX (4 * 1024 * 1024)
>> +
>> +#define QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX 64
>> +
>> +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata)
>> +{
>> +    mdata->sync_mr = ibv_reg_mr(mdata->rdma_ctx.pd,
>> +            &mdata->sync,
>> +            sizeof mdata->sync,
>> +            IBV_ACCESS_LOCAL_WRITE |
>> +            IBV_ACCESS_REMOTE_WRITE |
>> +            IBV_ACCESS_REMOTE_READ);
>> +    if (mdata->sync_mr) {
>> +        return 0;
>> +    }
>> +    return -1;
>> +}
>> +
>> +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata)
>> +{
>> +    return ibv_dereg_mr(mdata->sync_mr);
>> +}
>> +
>> +int qemu_rdma_migration_reg_remote_info(
>> +        struct qemu_rdma_migration_data *mdata)
>> +{
>> +    mdata->remote_info_mr = ibv_reg_mr(mdata->rdma_ctx.pd,
>> +            &mdata->remote_info,
>> +            sizeof mdata->remote_info,
>> +            IBV_ACCESS_LOCAL_WRITE |
>> +            IBV_ACCESS_REMOTE_WRITE |
>> +            IBV_ACCESS_REMOTE_READ);
>> +    if (mdata->remote_info_mr) {
>> +        return 0;
>> +    }
>> +    return -1;
>> +}
>> +
>> +int qemu_rdma_migration_dereg_remote_info(
>> +        struct qemu_rdma_migration_data *mdata)
>> +{
>> +    return ibv_dereg_mr(mdata->remote_info_mr);
>> +}
>> +
>> +
>> +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata,
>> +        int wr_id)
>> +{
>> +    struct ibv_sge sge;
>> +    struct ibv_send_wr send_wr = { 0 };
>> +    struct ibv_send_wr *bad_wr;
>> +
>> +    mdata->sync = 1;
>> +
>> +    sge.addr = (uint64_t)(&mdata->sync);
>> +    sge.length = sizeof mdata->sync;
>> +    sge.lkey = mdata->sync_mr->lkey;
>> +
>> +    send_wr.wr_id = wr_id;
>> +    send_wr.opcode = IBV_WR_SEND;
>> +    send_wr.send_flags = IBV_SEND_SIGNALED;
>> +    send_wr.sg_list = &sge;
>> +    send_wr.num_sge = 1;
>> +
>> +    if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) {
>> +        return -1;
>> +    }
>> +
>> +    /* balanced by the decrement in qemu_rdma_migration_poll() */
>> +    mdata->num_signaled_send++;
>> +    return 0;
>> +}
>> +
>> +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata,
>> +        int wr_id)
>> +{
>> +    struct ibv_sge sge;
>> +    struct ibv_recv_wr recv_wr = { 0 };
>> +    struct ibv_recv_wr *bad_wr;
>> +
>> +    mdata->sync = 1;
>> +
>> +    sge.addr = (uint64_t)(&mdata->sync);
>> +    sge.length = sizeof mdata->sync;
>> +    sge.lkey = mdata->sync_mr->lkey;
>> +
>> +    recv_wr.wr_id = wr_id;
>> +    recv_wr.sg_list = &sge;
>> +    recv_wr.num_sge = 1;
>> +
>> +    if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) {
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +
>> +}
>> +
>> +int qemu_rdma_migration_post_send_remote_info(
>> +        struct qemu_rdma_migration_data *mdata)
>> +{
>> +    struct ibv_sge sge;
>> +    struct ibv_send_wr send_wr = { 0 };
>> +    struct ibv_send_wr *bad_wr;
>> +
>> +    sge.addr = (uint64_t)(&mdata->remote_info);
>> +    sge.length = sizeof mdata->remote_info;
>> +    sge.lkey = mdata->remote_info_mr->lkey;
>> +
>> +    send_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO;
>> +    send_wr.opcode = IBV_WR_SEND;
>> +    send_wr.send_flags = IBV_SEND_SIGNALED;
>> +    send_wr.sg_list = &sge;
>> +    send_wr.num_sge = 1;
>> +
>> +    if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) {
>> +        return -1;
>> +    }
>> +
>> +    mdata->num_signaled_send++;
>> +    return 0;
>> +}
>> +
>> +int qemu_rdma_migration_post_recv_remote_info(
>> +        struct qemu_rdma_migration_data *mdata)
>> +{
>> +    struct ibv_sge sge;
>> +    struct ibv_recv_wr recv_wr = { 0 };
>> +    struct ibv_recv_wr *bad_wr;
>> +
>> +    sge.addr = (uint64_t)(&mdata->remote_info);
>> +    sge.length = sizeof mdata->remote_info;
>> +    sge.lkey = mdata->remote_info_mr->lkey;
>> +
>> +    recv_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO;
>> +    recv_wr.sg_list = &sge;
>> +    recv_wr.num_sge = 1;
>> +
>> +    if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) {
>> +        return -1;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +
>> +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata)
>> +{
>> +    int ret;
>> +    enum ibv_send_flags flags = 0;
>> +
>> +    if (!mdata->current_length) {
>> +        return 0;
>> +    }
>> +    if (mdata->num_unsignaled_send >=
>> +            QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) {
>> +        flags = IBV_SEND_SIGNALED;
>> +    }
>> +    ret = qemu_rdma_write(&mdata->rdma_ctx,
>> +            &(mdata->rdma_ram_blocks.block[mdata->current_index]),
>> +            mdata->current_offset,
>> +            mdata->current_length,
>> +            QEMU_RDMA_MIGRATION_WRID_RDMA, flags);
>> +
>> +    if (ret) {
>> +        return ret;
>> +    }
>> +
>> +    if (mdata->num_unsignaled_send >=
>> +            QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) {
>> +        mdata->num_unsignaled_send = 0;
>> +        mdata->num_signaled_send++;
>> +    } else {
>> +        mdata->num_unsignaled_send++;
>> +    }
>> +
>> +    mdata->total_bytes += mdata->current_length;
>> +    mdata->current_length = 0;
>> +    mdata->current_offset = 0;
>> +
>> +    return 0;
>> +}
>> +
>> +static inline int qemu_rdma_migration_in_current_block(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        uint64_t offset, uint64_t len)
>> +{
>> +    struct qemu_rdma_ram_block *block;
>> +
>> +    if (mdata->current_index < 0) {
>> +        return 0;
>> +    }
>> +    block = &(mdata->rdma_ram_blocks.block[mdata->current_index]);
>> +    if (offset < block->offset) {
>> +        return 0;
>> +    }
>> +    if (offset + len > block->offset + block->length) {
>> +        return 0;
>> +    }
>> +    return 1;
>> +}
>> +
>> +static inline int qemu_rdma_migration_in_current_chunk(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        uint64_t offset, uint64_t len)
>> +{
>> +    struct qemu_rdma_ram_block *block =
>> +        &(mdata->rdma_ram_blocks.block[mdata->current_index]);
>> +    uint8_t *chunk_start, *chunk_end, *host_addr;
>> +    if (mdata->current_chunk < 0) {
>> +        return 0;
>> +    }
>> +    host_addr = block->local_host_addr + (offset - block->offset);
>> +    chunk_start = QEMU_RDMA_REG_CHUNK_START(block, mdata->current_chunk);
>> +    if (chunk_start < block->local_host_addr) {
>> +        chunk_start = block->local_host_addr;
>> +    }
>> +    if (host_addr < chunk_start) {
>> +        return 0;
>> +    }
>> +    chunk_end = QEMU_RDMA_REG_CHUNK_END(block, mdata->current_chunk);
>> +    if (chunk_end > chunk_start + block->length) {
>> +        chunk_end = chunk_start + block->length;
>> +    }
>> +    if (host_addr + len > chunk_end) {
>> +        return 0;
>> +    }
>> +    return 1;
>> +}
>> +
>> +static inline int qemu_rdma_buffer_mergable(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        uint64_t offset, uint64_t len)
>> +{
>> +    if (mdata->current_length == 0) {
>> +        return 0;
>> +    }
>> +    if (offset != mdata->current_offset + mdata->current_length) {
>> +        return 0;
>> +    }
>> +    if (!qemu_rdma_migration_in_current_block(mdata, offset, len)) {
>> +        return 0;
>> +    }
>> +#ifdef QEMU_RDMA_CHUNK_REGISTRATION
>> +    if (!qemu_rdma_migration_in_current_chunk(mdata, offset, len)) {
>> +        return 0;
>> +    }
>> +#endif
>> +    return 1;
>> +}
>> +
>> +/* Note that buffer must be within a single block/chunk. */
>> +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata,
>> +                uint64_t offset, uint64_t len)
>> +{
>> +    int index = mdata->current_index;
>> +    int chunk_index = mdata->current_chunk;
>> +    int ret;
>> +
>> +    /* If we cannot merge it, we flush the current buffer first. */
>> +    if (!qemu_rdma_buffer_mergable(mdata, offset, len)) {
>> +        ret = qemu_rdma_migration_write_flush(mdata);
>> +        if (ret) {
>> +            return ret;
>> +        }
>> +        mdata->current_length = 0;
>> +        mdata->current_offset = offset;
>> +
>> +        if (qemu_rdma_search_ram_block(offset, len,
>> +                    &mdata->rdma_ram_blocks, &index, &chunk_index)) {
>> +            return -1;
>> +        }
>> +        mdata->current_index = index;
>> +        mdata->current_chunk = chunk_index;
>> +    }
>> +
>> +    /* merge it */
>> +    mdata->current_length += len;
>> +
>> +    /* flush it if buffer is too large */
>> +    if (mdata->current_length >= QEMU_RDMA_MIGRATION_MERGE_MAX) {
>> +        return qemu_rdma_migration_write_flush(mdata);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata)
>> +{
>> +    int ret;
>> +    struct ibv_wc wc;
>> +
>> +    ret = ibv_poll_cq(mdata->rdma_ctx.cq, 1, &wc);
>> +    if (!ret) {
>> +        return QEMU_RDMA_MIGRATION_WRID_NONE;
>> +    }
>> +    if (ret < 0) {
>> +        fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
>> +        return ret;
>> +    }
>> +    if (wc.status != IBV_WC_SUCCESS) {
>> +        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
>> +                        wc.status, ibv_wc_status_str(wc.status));
>> +        fprintf(stderr, "ibv_poll_cq wrid=%"PRIu64"!\n", wc.wr_id);
>> +
>> +        return -1;
>> +    }
>> +
>> +    if (!(wc.opcode & IBV_WC_RECV)) {
>> +        mdata->num_signaled_send--;
>> +    }
>> +
>> +    return  (int)wc.wr_id;
>> +}
>> +
>> +int qemu_rdma_migration_wait_for_wrid(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        int wrid)
>> +{
>> +#ifdef QEMU_RDMA_MIGRATION_BLOCKING
>> +    return qemu_rdma_migration_block_for_wrid(mdata, wrid);
>> +#else
>> +    return qemu_rdma_migration_poll_for_wrid(mdata, wrid);
>> +#endif
>> +}
>> +
>> +int qemu_rdma_migration_poll_for_wrid(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        int wrid)
>> +{
>> +    int r = QEMU_RDMA_MIGRATION_WRID_NONE;
>> +    while (r != wrid) {
>> +        r = qemu_rdma_migration_poll(mdata);
>> +        if (r < 0) {
>> +            return r;
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>> +int qemu_rdma_migration_block_for_wrid(
>> +        struct qemu_rdma_migration_data *mdata,
>> +        int wrid)
>> +{
>> +    int num_cq_events = 0;
>> +    int r = QEMU_RDMA_MIGRATION_WRID_NONE;
>> +    struct ibv_cq *cq;
>> +    void *cq_ctx;
>> +
>> +    if (ibv_req_notify_cq(mdata->rdma_ctx.cq, 0)) {
>> +        return -1;
>> +    }
>> +    /* poll cq first */
>> +    while (r != wrid) {
>> +        r = qemu_rdma_migration_poll(mdata);
>> +        if (r < 0) {
>> +            return r;
>> +        }
>> +        if (r == QEMU_RDMA_MIGRATION_WRID_NONE) {
>> +            break;
>> +        }
>> +    }
>> +    if (r == wrid) {
>> +        return 0;
>> +    }
>> +
>> +    while (1) {
>> +        if (ibv_get_cq_event(mdata->rdma_ctx.comp_channel,
>> +                    &cq, &cq_ctx)) {
>> +            goto err_block_for_wrid;
>> +        }
>> +        num_cq_events++;
>> +        if (ibv_req_notify_cq(cq, 0)) {
>> +            goto err_block_for_wrid;
>> +        }
>> +        /* poll cq */
>> +        while (r != wrid) {
>> +            r = qemu_rdma_migration_poll(mdata);
>> +            if (r < 0) {
>> +                goto err_block_for_wrid;
>> +            }
>> +            if (r == QEMU_RDMA_MIGRATION_WRID_NONE) {
>> +                break;
>> +            }
>> +        }
>> +        if (r == wrid) {
>> +            goto success_block_for_wrid;
>> +        }
>> +    }
>> +
>> +success_block_for_wrid:
>> +    if (num_cq_events) {
>> +        ibv_ack_cq_events(cq, num_cq_events);
>> +    }
>> +    return 0;
>> +
>> +err_block_for_wrid:
>> +    if (num_cq_events) {
>> +        ibv_ack_cq_events(cq, num_cq_events);
>> +    }
>> +    return -1;
>> +}
>> +
>> +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata)
>> +{
>> +    mdata->enabled = 0;
>> +    if (mdata->sync_mr) {
>> +        qemu_rdma_migration_dereg_sync(mdata);
>> +    }
>> +    if (mdata->remote_info_mr) {
>> +        qemu_rdma_migration_dereg_remote_info(mdata);
>> +    }
>> +    mdata->sync_mr = NULL;
>> +    mdata->remote_info_mr = NULL;
>> +    qemu_rdma_dereg_ram_blocks(&mdata->rdma_ram_blocks);
>> +    mdata->rdma_ram_blocks.num_blocks = 0;
>> +    qemu_rdma_cleanup(&mdata->rdma_ctx);
>> +    qemu_rdma_migration_data_init(mdata);
>> +}
>> +
>> +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata)
>> +{
>> +    int ret;
>> +
>> +    if (mdata->client_init_done) {
>> +        return 0;
>> +    }
>> +
>> +    ret = qemu_rdma_resolve_host(&mdata->rdma_ctx,
>> +            mdata->host, mdata->port);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error resolving host!\n");
>> +        goto err_rdma_client_init;
>> +    }
>> +
>> +    ret = qemu_rdma_alloc_pd_cq(&mdata->rdma_ctx);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
>> +        goto err_rdma_client_init;
>> +    }
>> +
>> +    ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error allocating qp!\n");
>> +        goto err_rdma_client_init;
>> +    }
>> +
>> +    ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
>> +        goto err_rdma_client_init;
>> +    }
>> +
>> +    ret = qemu_rdma_client_reg_ram_blocks(&mdata->rdma_ctx,
>> +            &mdata->rdma_ram_blocks);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error registering ram blocks!\n");
>> +        goto err_rdma_client_init;
>> +    }
>> +
>> +    ret = qemu_rdma_migration_reg_sync(mdata);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error registering sync data!\n");
>> +        goto err_rdma_client_init;
>> +    }
>> +
>> +    ret = qemu_rdma_migration_reg_remote_info(mdata);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error registering remote info!\n");
>> +        goto err_rdma_client_init;
>> +    }
>> +
>> +    ret = qemu_rdma_migration_post_recv_remote_info(mdata);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error posting remote info recv!\n");
>> +        goto err_rdma_client_init;
>> +    }
>> +
>> +    mdata->client_init_done = 1;
>> +    return 0;
>> +
>> +err_rdma_client_init:
>> +    qemu_rdma_migration_cleanup(mdata);
>> +    return -1;
>> +}
>> +
>> +
>> +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata)
>> +{
>> +
>> +    int ret;
>> +
>> +    ret = qemu_rdma_connect(&mdata->rdma_ctx, NULL, NULL, NULL, 0);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error connecting!\n");
>> +        goto err_rdma_client_connect;
>> +    }
>> +
>> +    /* wait for remote info */
>> +    ret = qemu_rdma_migration_wait_for_wrid(mdata,
>> +            QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO);
>> +    if (ret < 0) {
>> +        fprintf(stderr, "rdma migration: polling remote info error!\n");
>> +        goto err_rdma_client_connect;
>> +    }
>> +
>> +    ret = qemu_rdma_process_remote_ram_blocks(
>> +            &mdata->rdma_ram_blocks, &mdata->remote_info);
>> +    if (ret) {
>> +        fprintf(stderr,
>> +                "rdma migration: error processing remote ram blocks!\n");
>> +        goto err_rdma_client_connect;
>> +    }
>> +
>> +    mdata->total_bytes = 0;
>> +    mdata->enabled = 1;
>> +    return 0;
>> +
>> +err_rdma_client_connect:
>> +    qemu_rdma_migration_cleanup(mdata);
>> +    return -1;
>> +}
>> +
>> +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata)
>> +{
>> +
>> +    int ret;
>> +    struct sockaddr_in sin;
>> +    struct rdma_cm_id *listen_id;
>> +    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
>> +
>> +    /* create CM channel */
>> +    rdma_ctx->channel = rdma_create_event_channel();
>> +    if (!rdma_ctx->channel) {
>> +        return -1;
>> +    }
>> +
>> +    /* create CM id */
>> +    ret = rdma_create_id(rdma_ctx->channel, &listen_id, NULL,
>> +            RDMA_PS_TCP);
>> +    if (ret) {
>> +        goto err_server_init_create_listen_id;
>> +    }
>> +
>> +    memset(&sin, 0, sizeof(sin));
>> +    sin.sin_family = AF_INET;
>> +    sin.sin_port = htons(mdata->port);
>> +    if (strcmp("", mdata->host)) {
>> +        struct hostent *server_addr;
>> +        server_addr = gethostbyname(mdata->host);
>> +        if (!server_addr) {
>> +            goto err_server_init_bind_addr;
>> +        }
>> +        memcpy(&sin.sin_addr.s_addr, server_addr->h_addr,
>> +                server_addr->h_length);
>> +    } else {
>> +        sin.sin_addr.s_addr = INADDR_ANY;
>> +    }
>> +
>> +    ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin);
>> +    if (ret) {
>> +        goto err_server_init_bind_addr;
>> +    }
>> +    printf("verbs context after binding: %p\n", listen_id->verbs);
>> +
>> +    rdma_ctx->listen_id = listen_id;
>> +    if (listen_id->verbs) {
>> +        rdma_ctx->verbs = listen_id->verbs;
>> +    }
>> +    return 0;
>> +
>> +err_server_init_bind_addr:
>> +    rdma_destroy_id(listen_id);
>> +err_server_init_create_listen_id:
>> +    rdma_destroy_event_channel(rdma_ctx->channel);
>> +    rdma_ctx->channel = NULL;
>> +
>> +    return -1;
>> +
>> +}
>> +
>> +int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata)
>> +{
>> +    int ret;
>> +    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
>> +
>> +    if (!rdma_ctx->verbs) {
>> +        return 0;
>> +    }
>> +
>> +    ret = qemu_rdma_alloc_pd_cq(rdma_ctx);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
>> +        goto err_rdma_server_prepare;
>> +    }
>> +
>> +    ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
>> +        goto err_rdma_server_prepare;
>> +    }
>> +
>> +    ret = qemu_rdma_server_reg_ram_blocks(rdma_ctx,
>> +            &mdata->rdma_ram_blocks);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error registering ram blocks!\n");
>> +        goto err_rdma_server_prepare;
>> +    }
>> +
>> +    ret = qemu_rdma_migration_reg_sync(mdata);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error registering sync data!\n");
>> +        goto err_rdma_server_prepare;
>> +    }
>> +
>> +    qemu_rdma_copy_to_remote_ram_blocks(&mdata->rdma_ram_blocks,
>> +            &mdata->remote_info);
>> +
>> +    ret = qemu_rdma_migration_reg_remote_info(mdata);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error registering remote info!\n");
>> +        goto err_rdma_server_prepare;
>> +    }
>> +
>> +    ret = rdma_listen(rdma_ctx->listen_id, 5);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error listening on socket!\n");
>> +        goto err_rdma_server_prepare;
>> +    }
>> +
>> +    return 0;
>> +
>> +err_rdma_server_prepare:
>> +    qemu_rdma_migration_cleanup(mdata);
>> +    return -1;
>> +}
>> +
>> +int qemu_rdma_migration_server_wait_for_client(
>> +                                    struct qemu_rdma_migration_data *mdata)
>> +{
>> +
>> +    int ret;
>> +
>> +    ret = qemu_rdma_listen(mdata, mdata->host, mdata->port);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error listening!\n");
>> +        goto err_rdma_server_wait;
>> +    }
>> +
>> +    ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error allocating qp!\n");
>> +        goto err_rdma_server_wait;
>> +    }
>> +
>> +#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC
>> +    ret = qemu_rdma_migration_post_recv_sync(mdata,
>> +            QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error posting extra sync receive!\n");
>> +        goto err_rdma_server_wait;
>> +    }
>> +#endif
>> +
>> +    ret = qemu_rdma_migration_post_recv_sync(mdata,
>> +            QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error posting sync receive!\n");
>> +        goto err_rdma_server_wait;
>> +    }
>> +
>> +    ret = qemu_rdma_accept(&mdata->rdma_ctx, NULL, NULL, NULL, 0);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error accepting connection!\n");
>> +        goto err_rdma_server_wait;
>> +    }
>> +
>> +    /* send remote info */
>> +    ret = qemu_rdma_migration_post_send_remote_info(mdata);
>> +    if (ret) {
>> +        fprintf(stderr, "rdma migration: error sending remote info!\n");
>> +        goto err_rdma_server_wait;
>> +    }
>> +
>> +    /* wait for completion */
>> +    ret = qemu_rdma_migration_wait_for_wrid(mdata,
>> +            QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO);
>> +    if (ret < 0) {
>> +        fprintf(stderr, "rdma migration: polling remote info error!\n");
>> +        goto err_rdma_server_wait;
>> +    }
>> +
>> +    mdata->total_bytes = 0;
>> +    mdata->enabled = 1;
>> +    return 0;
>> +
>> +err_rdma_server_wait:
>> +    qemu_rdma_migration_cleanup(mdata);
>> +    return -1;
>> +
>> +}
>> +
>> +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata)
>> +{
>> +    qemu_rdma_init_context(&mdata->rdma_ctx);
>> +    mdata->port = rdmaport;
>> +    mdata->host = rdmahost;
>> +    mdata->enabled = 0;
>> +    mdata->rdma_ram_blocks.num_blocks = 0;
>> +    mdata->client_init_done = 0;
>> +    mdata->num_unsignaled_send = 0;
>> +    mdata->num_signaled_send = 0;
>> +    mdata->current_offset = 0;
>> +    mdata->current_length = 0;
>> +    mdata->current_index = -1;
>> +    mdata->current_chunk = -1;
>> +    mdata->sync = 0;
>> +    mdata->sync_mr = NULL;
>> +    mdata->remote_info_mr = NULL;
>> +}
>> +
>> +void qemu_rdma_migration_disable(void)
>> +{
>> +    rdma_mdata.port = -1;
>> +    rdma_mdata.enabled = 0;
>> +}
>>
>
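
Before the raw patch below, one note on the send path:
qemu_rdma_migration_write only stages an (offset, length) pair and merges
contiguous pages that fall in the same chunk; nothing is posted until a
flush, and only every QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX-th flush is
posted signaled, which bounds completion-queue traffic. A minimal caller
sketch, assuming an already-connected rdma_mdata (the wrapper name is
ours, not from the series):

    #include "qemu/rdma.h"

    /* Push npages contiguous dirty pages starting at first_offset. */
    static int rdma_put_pages_sketch(uint64_t first_offset, int npages,
                                     uint64_t page_size)
    {
        int i;

        for (i = 0; i < npages; i++) {
            /* contiguous, same-chunk pages merge into one RDMA write */
            if (qemu_rdma_migration_write(&rdma_mdata,
                                          first_offset + i * page_size,
                                          page_size) < 0) {
                return -1;
            }
        }
        /* post whatever is still merged but unposted */
        return qemu_rdma_migration_write_flush(&rdma_mdata);
    }

The IBV_WR_SEND of the sync byte then gives the receiver its ordering
point: on a reliable-connected queue pair the send is delivered only after
the data of all earlier RDMA writes on the same QP has been placed.
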

Patch

diff --git a/Makefile.target b/Makefile.target
index 760da1e..d1d6b8c 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -112,12 +112,13 @@  obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o
 obj-y += hw/
 obj-$(CONFIG_KVM) += kvm-all.o
 obj-$(CONFIG_NO_KVM) += kvm-stub.o
-obj-y += memory.o savevm.o cputlb.o
+# "tracefunc.o" will go away - I use GCC's -finstrument-functions support inside tracefunc.o
+obj-y += memory.o savevm.o cputlb.o qemu-rdma.o #tracefunc.o
 obj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += memory_mapping.o
 obj-$(CONFIG_HAVE_CORE_DUMP) += dump.o
 obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o
 obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o
-LIBS+=-lz
+LIBS+=-lz -lrdmacm
 
 # xen support
 obj-$(CONFIG_XEN) += xen-all.o xen-mapcache.o
diff --git a/include/qemu/rdma.h b/include/qemu/rdma.h
new file mode 100644
index 0000000..099622e
--- /dev/null
+++ b/include/qemu/rdma.h
@@ -0,0 +1,249 @@ 
+/*
+ * RDMA data structures and helper functions header (for migration)
+ *
+ * Copyright IBM, Corp. 2013
+ *
+ * Authors:
+ *  Michael R. Hines <mrhines@us.ibm.com>
+ *  Jiuxing Liu <jl@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef _QEMU_RDMA_H
+#define _QEMU_RDMA_H
+
+#include <rdma/rdma_cma.h>
+#include "monitor/monitor.h"
+
+extern int rdmaport;
+extern char rdmahost[64];
+
+struct qemu_rdma_context {
+    /* cm_id also has ibv_conext, rdma_event_channel, and ibv_qp in
+       cm_id->verbs, cm_id->channel, and cm_id->qp. */
+    struct rdma_cm_id *cm_id;
+    struct rdma_cm_id *listen_id;
+
+    struct ibv_context *verbs;
+    struct rdma_event_channel *channel;
+    struct ibv_qp *qp;
+
+    struct ibv_comp_channel *comp_channel;
+    struct ibv_pd *pd;
+    struct ibv_cq *cq;
+};
+
+static inline void qemu_rdma_init_context(struct qemu_rdma_context *rdma_ctx)
+{
+    rdma_ctx->cm_id = NULL;
+    rdma_ctx->listen_id = NULL;
+    rdma_ctx->verbs = NULL;
+    rdma_ctx->channel = NULL;
+    rdma_ctx->qp = NULL;
+    rdma_ctx->comp_channel = NULL;
+    rdma_ctx->pd = NULL;
+    rdma_ctx->cq = NULL;
+}
+
+void cpu_physical_memory_reset_dirty_all(void);
+
+int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx,
+        const char *host, int port);
+int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx);
+int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx);
+int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx,
+        void *in_data, int *in_len, void *out_data, int out_len);
+int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx,
+        void *in_data, int *in_len, void *out_data, int out_len);
+void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx);
+void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx);
+
+/* Instead of registering whole ram blocks, we can register them in smaller
+ * chunks. This may be beneficial if the ram blocks have holes in them. */
+#define QEMU_RDMA_CHUNK_REGISTRATION
+
+#define QEMU_RDMA_LAZY_REGISTRATION
+
+#define QEMU_RDMA_REG_CHUNK_SHIFT 20
+#define QEMU_RDMA_REG_CHUNK_SIZE (1UL << (QEMU_RDMA_REG_CHUNK_SHIFT))
+#define QEMU_RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \
+            (((unsigned long)(host_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT) - \
+            ((unsigned long)(start_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT))
+#define QEMU_RDMA_REG_NUM_CHUNKS(rdma_ram_block) \
+            (QEMU_RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\
+                (rdma_ram_block)->local_host_addr +\
+                (rdma_ram_block)->length) + 1)
+#define QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\
+            ((((unsigned long)((rdma_ram_block)->local_host_addr) >> \
+                QEMU_RDMA_REG_CHUNK_SHIFT) + (i)) << \
+                QEMU_RDMA_REG_CHUNK_SHIFT))
+#define QEMU_RDMA_REG_CHUNK_END(rdma_ram_block, i) \
+            (QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) + \
+             QEMU_RDMA_REG_CHUNK_SIZE)
+
+struct qemu_rdma_ram_block {
+    uint8_t *local_host_addr;
+    uint64_t remote_host_addr;
+    uint64_t offset;
+    uint64_t length;
+    struct ibv_mr **pmr;
+    struct ibv_mr *mr;
+    uint32_t remote_rkey;
+};
+
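+/* Wire-format description of a ram block, sent by the destination to
+ * the source during setup so RDMA writes can target remote memory. */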
+struct qemu_rdma_remote_ram_block {
+    uint64_t remote_host_addr;
+    uint64_t offset;
+    uint64_t length;
+    uint32_t remote_rkey;
+};
+
+#define QEMU_MAX_RAM_BLOCKS 64
+
+struct qemu_rdma_ram_blocks {
+    int num_blocks;
+    struct qemu_rdma_ram_block block[QEMU_MAX_RAM_BLOCKS];
+};
+
+struct qemu_rdma_remote_ram_blocks {
+    int num_blocks;
+    struct qemu_rdma_remote_ram_block block[QEMU_MAX_RAM_BLOCKS];
+};
+
+int qemu_rdma_init_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks);
+int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
+int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
+int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
+int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_blocks *rdma_ram_blocks);
+void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks);
+
+void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
+        struct qemu_rdma_remote_ram_blocks *remote);
+int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
+        struct qemu_rdma_remote_ram_blocks *remote);
+
+int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length,
+        struct qemu_rdma_ram_blocks *blocks,
+        int *block_index, int *chunk_index);
+
+struct qemu_rdma_migration_data {
+    char *host;
+    int port;
+    int enabled;
+
+    struct qemu_rdma_context rdma_ctx;
+    struct qemu_rdma_ram_blocks rdma_ram_blocks;
+
+    /* This is used for synchronization: We use
+       IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs
+       are done. When the receiver gets it, it can be certain
+       that all the RDMAs are completed. */
+    int sync;
+    struct ibv_mr *sync_mr;
+
+    /* This is used for the server to write the remote
+       ram blocks info. */
+    struct qemu_rdma_remote_ram_blocks remote_info;
+    struct ibv_mr *remote_info_mr;
+
+    /* The rest is only for the initiator of the migration. */
+    int client_init_done;
+
+    /* number of outstanding unsignaled send */
+    int num_unsignaled_send;
+
+    /* number of outstanding signaled send */
+    int num_signaled_send;
+
+    /* store info about current buffer so that we can
+       merge it with future sends */
+    uint64_t current_offset;
+    uint64_t current_length;
+    /* index of ram block the current buffer belongs to */
+    int current_index;
+    /* index of the chunk in the current ram block */
+    int current_chunk;
+
+    uint64_t total_bytes;
+
+};
+
+extern struct qemu_rdma_migration_data rdma_mdata;
+
+void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata);
+
+static inline int qemu_use_rdma_migration(void)
+{
+    /* port is left at -1 unless the user asked for RDMA migration. */
+    return rdma_mdata.port != -1;
+}
+
+static inline int qemu_rdma_migration_enabled(void)
+{
+    return rdma_mdata.enabled;
+}
+
+void qemu_rdma_migration_disable(void);
+
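+/* QEMU_RDMA_MIGRATION_BLOCKING selects sleeping on the completion
+ * channel (ibv_get_cq_event) over busy-polling the CQ when waiting
+ * for a work request to complete. */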
+#define QEMU_RDMA_MIGRATION_BLOCKING
+#define QEMU_RDMA_MIGRATION_EXTRA_SYNC
+
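+/* Work request IDs, used to match completion-queue entries back to
+ * the operation that posted them. */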
+enum {
+    QEMU_RDMA_MIGRATION_WRID_NONE = 0,
+    QEMU_RDMA_MIGRATION_WRID_RDMA,
+    QEMU_RDMA_MIGRATION_WRID_SEND_SYNC,
+    QEMU_RDMA_MIGRATION_WRID_RECV_SYNC,
+    QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO,
+    QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO,
+    QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC,
+    QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC,
+};
+
+int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host,
+                                                              int port);
+int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata,
+        int wr_id);
+int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata,
+        int wr_id);
+
+int qemu_rdma_migration_reg_remote_info(
+        struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_dereg_remote_info(
+        struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_post_send_remote_info(
+        struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_post_recv_remote_info(
+        struct qemu_rdma_migration_data *mdata);
+
+int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata,
+        uint64_t addr, uint64_t len);
+int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_wait_for_wrid(
+        struct qemu_rdma_migration_data *mdata,
+        int wrid);
+int qemu_rdma_migration_poll_for_wrid(
+        struct qemu_rdma_migration_data *mdata,
+        int wrid);
+int qemu_rdma_migration_block_for_wrid(
+        struct qemu_rdma_migration_data *mdata,
+        int wrid);
+void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata);
+
+int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_server_wait_for_client(
+                                       struct qemu_rdma_migration_data *mdata);
+
+#endif
diff --git a/qemu-rdma.c b/qemu-rdma.c
new file mode 100644
index 0000000..5f16875
--- /dev/null
+++ b/qemu-rdma.c
@@ -0,0 +1,1357 @@ 
+/*
+ * RDMA data structures and helper functions (for migration)
+ *
+ * Copyright IBM, Corp. 2013
+ *
+ * Authors:
+ *  Michael R. Hines <mrhines@us.ibm.com>
+ *  Jiuxing Liu <jl@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/rdma.h"
+#include "qemu-common.h"
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#define QEMU_RDMA_RESOLVE_TIMEOUT_MS 10000
+#define QEMU_RDMA_CQ_SIZE 2000
+#define QEMU_RDMA_QP_SIZE 1000
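+/* The CQ must be able to hold a completion for everything that can be
+ * outstanding at once; 2000 entries comfortably cover QEMU_RDMA_QP_SIZE
+ * signaled sends plus the handful of posted receives. */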
+
+int rdmaport = -1;
+char rdmahost[64] = "";
+struct qemu_rdma_migration_data rdma_mdata;
+
+static void *qemu_rdma_mallocz(size_t size)
+{
+    void *ptr = malloc(size);
+    /* callers check for NULL, so do not memset a failed allocation */
+    if (ptr) {
+        memset(ptr, 0, size);
+    }
+    return ptr;
+}
+
+int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx,
+        const char *host, int port)
+{
+    int ret;
+    struct addrinfo *res;
+    char port_str[16];
+    struct rdma_cm_event *cm_event;
+
+
+    if (!strcmp(host, "")) {
+        printf("RDMA hostname has not been set\n");
+        return -1;
+    }
+
+    /* create CM channel */
+    rdma_ctx->channel = rdma_create_event_channel();
+    if (!rdma_ctx->channel) {
+        printf("could not create CM channel\n");
+        return -1;
+    }
+
+    /* create CM id */
+    ret = rdma_create_id(rdma_ctx->channel, &rdma_ctx->cm_id, NULL,
+            RDMA_PS_TCP);
+    if (ret) {
+        printf("could not create channel id\n");
+        goto err_resolve_create_id;
+    }
+
+    snprintf(port_str, sizeof(port_str), "%d", port);
+    ret = getaddrinfo(host, port_str, NULL, &res);
+    if (ret) { /* getaddrinfo returns nonzero EAI_* codes on failure */
+        printf("getaddrinfo failed for address %s\n", host);
+        goto err_resolve_get_addr;
+    }
+
+    /* resolve the first address */
+    ret = rdma_resolve_addr(rdma_ctx->cm_id, NULL, res->ai_addr,
+            QEMU_RDMA_RESOLVE_TIMEOUT_MS);
+    freeaddrinfo(res); /* no longer needed once the address is handed off */
+    if (ret) {
+        printf("could not resolve address %s\n", host);
+        goto err_resolve_get_addr;
+    }
+
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        printf("could not perform event_addr_resolved\n");
+        goto err_resolve_get_addr;
+    }
+    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
+        printf("result not equal to event_addr_resolved\n");
+        rdma_ack_cm_event(cm_event);
+        goto err_resolve_get_addr;
+    }
+    rdma_ack_cm_event(cm_event);
+
+    /* resolve route */
+    ret = rdma_resolve_route(rdma_ctx->cm_id, QEMU_RDMA_RESOLVE_TIMEOUT_MS);
+    if (ret) {
+        printf("could not resolve rdma route\n");
+        goto err_resolve_get_addr;
+    }
+
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        printf("could not perform event_route_resolved\n");
+        goto err_resolve_get_addr;
+    }
+    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
+        printf("result not equal to event_route_resolved\n");
+        rdma_ack_cm_event(cm_event);
+        goto err_resolve_get_addr;
+    }
+    rdma_ack_cm_event(cm_event);
+
+    rdma_ctx->verbs = rdma_ctx->cm_id->verbs;
+    return 0;
+
+err_resolve_get_addr:
+    rdma_destroy_id(rdma_ctx->cm_id);
+err_resolve_create_id:
+    rdma_destroy_event_channel(rdma_ctx->channel);
+    rdma_ctx->channel = NULL;
+
+    return -1;
+}
+
+int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx)
+{
+
+    /* allocate pd */
+    rdma_ctx->pd = ibv_alloc_pd(rdma_ctx->verbs);
+    if (!rdma_ctx->pd) {
+        return -1;
+    }
+
+#ifdef QEMU_RDMA_MIGRATION_BLOCKING
+    /* create completion channel */
+    rdma_ctx->comp_channel = ibv_create_comp_channel(rdma_ctx->verbs);
+    if (!rdma_ctx->comp_channel) {
+        goto err_alloc_pd_cq;
+    }
+#endif
+
+    /* create cq */
+    rdma_ctx->cq = ibv_create_cq(rdma_ctx->verbs, QEMU_RDMA_CQ_SIZE,
+            NULL, rdma_ctx->comp_channel, 0);
+    if (!rdma_ctx->cq) {
+        goto err_alloc_pd_cq;
+    }
+
+    return 0;
+
+err_alloc_pd_cq:
+    if (rdma_ctx->pd) {
+        ibv_dealloc_pd(rdma_ctx->pd);
+    }
+    if (rdma_ctx->comp_channel) {
+        ibv_destroy_comp_channel(rdma_ctx->comp_channel);
+    }
+    rdma_ctx->pd = NULL;
+    rdma_ctx->comp_channel = NULL;
+    return -1;
+
+}
+
+int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx)
+{
+    struct ibv_qp_init_attr attr = { 0 };
+    int ret;
+
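+    /* a reliable connected QP, sized for many outbound RDMA writes but
+       only the couple of pre-posted receives (sync and remote info) */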
+    attr.cap.max_send_wr = QEMU_RDMA_QP_SIZE;
+    attr.cap.max_recv_wr = 2;
+    attr.cap.max_send_sge = 1;
+    attr.cap.max_recv_sge = 1;
+    attr.send_cq = rdma_ctx->cq;
+    attr.recv_cq = rdma_ctx->cq;
+    attr.qp_type = IBV_QPT_RC;
+
+    ret = rdma_create_qp(rdma_ctx->cm_id, rdma_ctx->pd, &attr);
+    if (ret) {
+        return -1;
+    }
+
+    rdma_ctx->qp = rdma_ctx->cm_id->qp;
+    return 0;
+}
+
+int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx,
+        void *in_data, int *in_len, void *out_data, int out_len)
+{
+    int ret;
+    struct rdma_conn_param conn_param = { 0 };
+    struct rdma_cm_event *cm_event;
+
+    conn_param.initiator_depth = 2;
+    conn_param.retry_count = 5;
+    conn_param.private_data = out_data;
+    conn_param.private_data_len = out_len;
+
+    ret = rdma_connect(rdma_ctx->cm_id, &conn_param);
+    if (ret) {
+        return -1;
+    }
+
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        return -1;
+    }
+    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
+        rdma_ack_cm_event(cm_event); /* do not leak the unexpected event */
+        return -1;
+    }
+
+    if (in_len) {
+        if (*in_len > cm_event->param.conn.private_data_len) {
+            *in_len = cm_event->param.conn.private_data_len;
+        }
+        if (*in_len) {
+            memcpy(in_data, cm_event->param.conn.private_data, *in_len);
+        }
+    }
+
+    rdma_ack_cm_event(cm_event);
+
+    return 0;
+}
+
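+/* Block until a client connects. With a wildcard bind the verbs
+ * context only becomes known here, so the deferred server setup in
+ * qemu_rdma_migration_server_prepare() is finished on first connect. */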
+int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host,
+        int port)
+{
+    int ret;
+    struct rdma_cm_event *cm_event;
+    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
+    struct ibv_context *verbs;
+
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        goto err_listen;
+    }
+
+    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
+        rdma_ack_cm_event(cm_event);
+        goto err_listen;
+    }
+
+    rdma_ctx->cm_id = cm_event->id;
+    verbs = cm_event->id->verbs;
+    printf("verbs context after listen: %p\n", verbs);
+    rdma_ack_cm_event(cm_event);
+
+    if (!rdma_ctx->verbs) {
+        rdma_ctx->verbs = verbs;
+        ret = qemu_rdma_migration_server_prepare(mdata);
+        if (ret) {
+            fprintf(stderr, "rdma migration: error preparing server!\n");
+            goto err_listen;
+        }
+    } else if (rdma_ctx->verbs != verbs) {
+            fprintf(stderr, "ibv context not matching %p, %p!\n",
+                    rdma_ctx->verbs, verbs);
+            goto err_listen;
+    }
+    /* xxx destroy listen_id ??? */
+
+    return 0;
+
+err_listen:
+
+    return -1;
+
+}
+
+int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx,
+        void *in_data, int *in_len, void *out_data, int out_len)
+{
+    int ret;
+    struct rdma_conn_param conn_param = { 0 };
+    struct rdma_cm_event *cm_event;
+
+    conn_param.responder_resources = 2;
+    conn_param.private_data = out_data;
+    conn_param.private_data_len = out_len;
+
+    ret = rdma_accept(rdma_ctx->cm_id, &conn_param);
+    if (ret) {
+        fprintf(stderr, "rdma_accept returns %d!\n", ret);
+        return -1;
+    }
+
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        return -1;
+    }
+
+    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
+        rdma_ack_cm_event(cm_event);
+        return -1;
+    }
+
+    if (in_len) {
+        if (*in_len > cm_event->param.conn.private_data_len) {
+            *in_len = cm_event->param.conn.private_data_len;
+        }
+        if (*in_len) {
+            memcpy(in_data, cm_event->param.conn.private_data, *in_len);
+        }
+    }
+
+    rdma_ack_cm_event(cm_event);
+
+    return 0;
+}
+
+void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx)
+{
+    int ret;
+    struct rdma_cm_event *cm_event;
+
+    ret = rdma_disconnect(rdma_ctx->cm_id);
+    if (ret) {
+        return;
+    }
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        return;
+    }
+    rdma_ack_cm_event(cm_event);
+}
+
+void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx)
+{
+    if (rdma_ctx->qp) {
+        ibv_destroy_qp(rdma_ctx->qp);
+    }
+    if (rdma_ctx->cq) {
+        ibv_destroy_cq(rdma_ctx->cq);
+    }
+    if (rdma_ctx->comp_channel) {
+        ibv_destroy_comp_channel(rdma_ctx->comp_channel);
+    }
+    if (rdma_ctx->pd) {
+        ibv_dealloc_pd(rdma_ctx->pd);
+    }
+    if (rdma_ctx->listen_id) {
+        rdma_destroy_id(rdma_ctx->listen_id);
+    }
+    if (rdma_ctx->cm_id) {
+        rdma_destroy_id(rdma_ctx->cm_id);
+    }
+    if (rdma_ctx->channel) {
+        rdma_destroy_event_channel(rdma_ctx->channel);
+    }
+}
+
+int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_blocks *rdma_ram_blocks)
+{
+    int i, j;
+    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
+        struct qemu_rdma_ram_block *block = &(rdma_ram_blocks->block[i]);
+        int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block);
+        /* allocate memory to store chunk MRs */
+        rdma_ram_blocks->block[i].pmr = qemu_rdma_mallocz(
+                                num_chunks * sizeof(struct ibv_mr *));
+
+        if (!block->pmr) {
+            goto err_reg_chunk_ram_blocks;
+        }
+
+        for (j = 0; j < num_chunks; j++) {
+            uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, j);
+            uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, j);
+            if (start_addr < block->local_host_addr) {
+                start_addr = block->local_host_addr;
+            }
+            if (end_addr > block->local_host_addr + block->length) {
+                end_addr = block->local_host_addr + block->length;
+            }
+            block->pmr[j] = ibv_reg_mr(rdma_ctx->pd,
+                                start_addr,
+                                end_addr - start_addr,
+                                IBV_ACCESS_LOCAL_WRITE |
+                                IBV_ACCESS_REMOTE_WRITE |
+                                IBV_ACCESS_REMOTE_READ);
+            if (!block->pmr[j]) {
+                break;
+            }
+        }
+        if (j < num_chunks) {
+            for (j--; j >= 0; j--) {
+                ibv_dereg_mr(block->pmr[j]);
+            }
+            /* release this block's MR array; the error path below only
+               cleans up the blocks registered before it */
+            free(block->pmr);
+            block->pmr = NULL;
+            goto err_reg_chunk_ram_blocks;
+        }
+    }
+
+    return 0;
+
+err_reg_chunk_ram_blocks:
+    for (i--; i >= 0; i--) {
+        int num_chunks =
+            QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
+        for (j = 0; j < num_chunks; j++) {
+            ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]);
+        }
+        free(rdma_ram_blocks->block[i].pmr);
+        rdma_ram_blocks->block[i].pmr = NULL;
+    }
+
+    return -1;
+
+}
+
+int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_blocks *rdma_ram_blocks)
+{
+    int i;
+    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
+        rdma_ram_blocks->block[i].mr =
+            ibv_reg_mr(rdma_ctx->pd,
+                    rdma_ram_blocks->block[i].local_host_addr,
+                    rdma_ram_blocks->block[i].length,
+                    IBV_ACCESS_LOCAL_WRITE |
+                    IBV_ACCESS_REMOTE_WRITE |
+                    IBV_ACCESS_REMOTE_READ);
+        if (!rdma_ram_blocks->block[i].mr) {
+                break;
+        }
+    }
+
+    if (i >= rdma_ram_blocks->num_blocks) {
+        return 0;
+    }
+
+    for (i--; i >= 0; i--) {
+        ibv_dereg_mr(rdma_ram_blocks->block[i].mr);
+    }
+
+    return -1;
+
+}
+
+int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+                struct qemu_rdma_ram_blocks *rdma_ram_blocks)
+{
+#ifdef QEMU_RDMA_CHUNK_REGISTRATION
+#ifdef QEMU_RDMA_LAZY_REGISTRATION
+    return 0;
+#else
+    return qemu_rdma_reg_chunk_ram_blocks(rdma_ctx, rdma_ram_blocks);
+#endif
+#else
+    return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks);
+#endif
+}
+
+int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+                struct qemu_rdma_ram_blocks *rdma_ram_blocks)
+{
+    return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks);
+}
+
+void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks)
+{
+    int i, j;
+    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
+        int num_chunks;
+        if (!rdma_ram_blocks->block[i].pmr) {
+            continue;
+        }
+        num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
+        for (j = 0; j < num_chunks; j++) {
+            if (!rdma_ram_blocks->block[i].pmr[j]) {
+                continue;
+            }
+            ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]);
+        }
+        free(rdma_ram_blocks->block[i].pmr);
+        rdma_ram_blocks->block[i].pmr = NULL;
+    }
+    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
+        if (!rdma_ram_blocks->block[i].mr) {
+            continue;
+        }
+        ibv_dereg_mr(rdma_ram_blocks->block[i].mr);
+        rdma_ram_blocks->block[i].mr = NULL;
+    }
+}
+
+void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
+                struct qemu_rdma_remote_ram_blocks *remote)
+{
+    int i;
+    remote->num_blocks = local->num_blocks;
+    for (i = 0; i < local->num_blocks; i++) {
+            remote->block[i].remote_host_addr =
+                (uint64_t)(local->block[i].local_host_addr);
+            remote->block[i].remote_rkey = local->block[i].mr->rkey;
+            remote->block[i].offset = local->block[i].offset;
+            remote->block[i].length = local->block[i].length;
+    }
+}
+
+int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local,
+                struct qemu_rdma_remote_ram_blocks *remote)
+{
+    int i, j;
+
+    if (local->num_blocks != remote->num_blocks) {
+        return -1;
+    }
+
+    for (i = 0; i < remote->num_blocks; i++) {
+        /* search local ram blocks */
+        for (j = 0; j < local->num_blocks; j++) {
+            if (remote->block[i].offset != local->block[j].offset) {
+                continue;
+            }
+            if (remote->block[i].length != local->block[j].length) {
+                return -1;
+            }
+            local->block[j].remote_host_addr =
+                remote->block[i].remote_host_addr;
+            local->block[j].remote_rkey = remote->block[i].remote_rkey;
+            break;
+        }
+        if (j >= local->num_blocks) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length,
+                struct qemu_rdma_ram_blocks *blocks,
+                int *block_index, int *chunk_index)
+{
+    int i;
+    for (i = 0; i < blocks->num_blocks; i++) {
+        if (offset < blocks->block[i].offset) {
+            continue;
+        }
+        if (offset + length >
+                blocks->block[i].offset + blocks->block[i].length) {
+            continue;
+        }
+        *block_index = i;
+        if (chunk_index) {
+            uint8_t *host_addr = blocks->block[i].local_host_addr +
+                (offset - blocks->block[i].offset);
+            *chunk_index = QEMU_RDMA_REG_CHUNK_INDEX(
+                    blocks->block[i].local_host_addr, host_addr);
+        }
+        return 0;
+    }
+    return -1;
+}
+
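+/* Look up the lkey covering host_addr: use the whole-block MR when one
+ * exists, otherwise the per-chunk MR, registering the chunk on first
+ * touch (this is where lazy registration takes effect). */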
+static int qemu_rdma_get_lkey(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_block *block, uint64_t host_addr,
+        uint32_t *lkey)
+{
+    int chunk;
+    if (block->mr) {
+        *lkey = block->mr->lkey;
+        return 0;
+    }
+    if (!block->pmr) {
+        int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block);
+        /* allocate memory to store chunk MRs */
+        block->pmr = qemu_rdma_mallocz(num_chunks *
+                sizeof(struct ibv_mr *));
+        if (!block->pmr) {
+            return -1;
+        }
+    }
+    chunk = QEMU_RDMA_REG_CHUNK_INDEX(block->local_host_addr, host_addr);
+    if (!block->pmr[chunk]) {
+        uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, chunk);
+        uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, chunk);
+        if (start_addr < block->local_host_addr) {
+            start_addr = block->local_host_addr;
+        }
+        if (end_addr > block->local_host_addr + block->length) {
+            end_addr = block->local_host_addr + block->length;
+        }
+        block->pmr[chunk] = ibv_reg_mr(rdma_ctx->pd,
+                start_addr,
+                end_addr - start_addr,
+                IBV_ACCESS_LOCAL_WRITE |
+                IBV_ACCESS_REMOTE_WRITE |
+                IBV_ACCESS_REMOTE_READ);
+        if (!block->pmr[chunk]) {
+            return -1;
+        }
+    }
+    *lkey = block->pmr[chunk]->lkey;
+    return 0;
+}
+
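+/* Post a one-sided RDMA write for [offset, offset + length) of a ram
+ * block; the destination address is the block's remote base plus the
+ * same relative offset, so both sides must share the block layout. */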
+static int qemu_rdma_write(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_block *block,
+        uint64_t offset, uint64_t length,
+        uint64_t wr_id, enum ibv_send_flags flag)
+{
+    struct ibv_sge sge;
+    struct ibv_send_wr send_wr = { 0 };
+    struct ibv_send_wr *bad_wr;
+
+    sge.addr = (uint64_t)(block->local_host_addr + (offset - block->offset));
+    sge.length = length;
+    if (qemu_rdma_get_lkey(rdma_ctx, block, sge.addr, &sge.lkey)) {
+        fprintf(stderr, "cannot get lkey!\n");
+        return -1;
+    }
+    send_wr.wr_id = wr_id;
+    send_wr.opcode = IBV_WR_RDMA_WRITE;
+    send_wr.send_flags = flag;
+    send_wr.sg_list = &sge;
+    send_wr.num_sge = 1;
+    send_wr.wr.rdma.rkey = block->remote_rkey;
+    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
+        (offset - block->offset);
+
+    if (ibv_post_send(rdma_ctx->qp, &send_wr, &bad_wr)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Do not merge data if larger than this. */
+#define QEMU_RDMA_MIGRATION_MERGE_MAX (4 * 1024 * 1024)
+
+#define QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX 64
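+/* Only every 64th RDMA write requests a completion entry. On an RC QP
+ * a signaled completion implicitly retires the unsignaled writes queued
+ * before it, which keeps CQ overhead low on the hot path. */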
+
+int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata)
+{
+    mdata->sync_mr = ibv_reg_mr(mdata->rdma_ctx.pd,
+            &mdata->sync,
+            sizeof mdata->sync,
+            IBV_ACCESS_LOCAL_WRITE |
+            IBV_ACCESS_REMOTE_WRITE |
+            IBV_ACCESS_REMOTE_READ);
+    if (mdata->sync_mr) {
+        return 0;
+    }
+    return -1;
+}
+
+int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata)
+{
+    return ibv_dereg_mr(mdata->sync_mr);
+}
+
+int qemu_rdma_migration_reg_remote_info(
+        struct qemu_rdma_migration_data *mdata)
+{
+    mdata->remote_info_mr = ibv_reg_mr(mdata->rdma_ctx.pd,
+            &mdata->remote_info,
+            sizeof mdata->remote_info,
+            IBV_ACCESS_LOCAL_WRITE |
+            IBV_ACCESS_REMOTE_WRITE |
+            IBV_ACCESS_REMOTE_READ);
+    if (mdata->remote_info_mr) {
+        return 0;
+    }
+    return -1;
+}
+
+int qemu_rdma_migration_dereg_remote_info(
+        struct qemu_rdma_migration_data *mdata)
+{
+    return ibv_dereg_mr(mdata->remote_info_mr);
+}
+
+
+int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata,
+        int wr_id)
+{
+    struct ibv_sge sge;
+    struct ibv_send_wr send_wr = { 0 };
+    struct ibv_send_wr *bad_wr;
+
+    mdata->sync = 1;
+
+    sge.addr = (uint64_t)(&mdata->sync);
+    sge.length = sizeof mdata->sync;
+    sge.lkey = mdata->sync_mr->lkey;
+
+    send_wr.wr_id = wr_id;
+    send_wr.opcode = IBV_WR_SEND;
+    send_wr.send_flags = IBV_SEND_SIGNALED;
+    send_wr.sg_list = &sge;
+    send_wr.num_sge = 1;
+
+    if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) {
+        return -1;
+    }
+
+    /* balance the decrement applied by qemu_rdma_migration_poll()
+       when this signaled send completes */
+    mdata->num_signaled_send++;
+    return 0;
+}
+
+int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata,
+        int wr_id)
+{
+    struct ibv_sge sge;
+    struct ibv_recv_wr recv_wr = { 0 };
+    struct ibv_recv_wr *bad_wr;
+
+    mdata->sync = 1;
+
+    sge.addr = (uint64_t)(&mdata->sync);
+    sge.length = sizeof mdata->sync;
+    sge.lkey = mdata->sync_mr->lkey;
+
+    recv_wr.wr_id = wr_id;
+    recv_wr.sg_list = &sge;
+    recv_wr.num_sge = 1;
+
+    if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) {
+        return -1;
+    }
+
+    return 0;
+
+}
+
+int qemu_rdma_migration_post_send_remote_info(
+        struct qemu_rdma_migration_data *mdata)
+{
+    struct ibv_sge sge;
+    struct ibv_send_wr send_wr = { 0 };
+    struct ibv_send_wr *bad_wr;
+
+    sge.addr = (uint64_t)(&mdata->remote_info);
+    sge.length = sizeof mdata->remote_info;
+    sge.lkey = mdata->remote_info_mr->lkey;
+
+    send_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO;
+    send_wr.opcode = IBV_WR_SEND;
+    send_wr.send_flags = IBV_SEND_SIGNALED;
+    send_wr.sg_list = &sge;
+    send_wr.num_sge = 1;
+
+    if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) {
+        return -1;
+    }
+
+    /* balance the decrement applied by qemu_rdma_migration_poll()
+       when this signaled send completes */
+    mdata->num_signaled_send++;
+    return 0;
+}
+
+int qemu_rdma_migration_post_recv_remote_info(
+        struct qemu_rdma_migration_data *mdata)
+{
+    struct ibv_sge sge;
+    struct ibv_recv_wr recv_wr = { 0 };
+    struct ibv_recv_wr *bad_wr;
+
+    sge.addr = (uint64_t)(&mdata->remote_info);
+    sge.length = sizeof mdata->remote_info;
+    sge.lkey = mdata->remote_info_mr->lkey;
+
+    recv_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO;
+    recv_wr.sg_list = &sge;
+    recv_wr.num_sge = 1;
+
+    if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+
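+/* Flush the currently merged range as a single RDMA write request. */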
+int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata)
+{
+    int ret;
+    enum ibv_send_flags flags = 0;
+
+    if (!mdata->current_length) {
+        return 0;
+    }
+    if (mdata->num_unsignaled_send >=
+            QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) {
+        flags = IBV_SEND_SIGNALED;
+    }
+    ret = qemu_rdma_write(&mdata->rdma_ctx,
+            &(mdata->rdma_ram_blocks.block[mdata->current_index]),
+            mdata->current_offset,
+            mdata->current_length,
+            QEMU_RDMA_MIGRATION_WRID_RDMA, flags);
+
+    if (ret) {
+        return ret;
+    }
+
+    if (mdata->num_unsignaled_send >=
+            QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) {
+        mdata->num_unsignaled_send = 0;
+        mdata->num_signaled_send++;
+    } else {
+        mdata->num_unsignaled_send++;
+    }
+
+    mdata->total_bytes += mdata->current_length;
+    mdata->current_length = 0;
+    mdata->current_offset = 0;
+
+    return 0;
+}
+
+static inline int qemu_rdma_migration_in_current_block(
+        struct qemu_rdma_migration_data *mdata,
+        uint64_t offset, uint64_t len)
+{
+    struct qemu_rdma_ram_block *block;
+    if (mdata->current_index < 0) {
+        return 0;
+    }
+    /* only index the block array once the index is known to be valid */
+    block = &(mdata->rdma_ram_blocks.block[mdata->current_index]);
+    if (offset < block->offset) {
+        return 0;
+    }
+    if (offset + len > block->offset + block->length) {
+        return 0;
+    }
+    return 1;
+}
+
+static inline int qemu_rdma_migration_in_current_chunk(
+        struct qemu_rdma_migration_data *mdata,
+        uint64_t offset, uint64_t len)
+{
+    struct qemu_rdma_ram_block *block;
+    uint8_t *chunk_start, *chunk_end, *host_addr;
+    if (mdata->current_chunk < 0) {
+        return 0;
+    }
+    block = &(mdata->rdma_ram_blocks.block[mdata->current_index]);
+    host_addr = block->local_host_addr + (offset - block->offset);
+    chunk_start = QEMU_RDMA_REG_CHUNK_START(block, mdata->current_chunk);
+    if (chunk_start < block->local_host_addr) {
+        chunk_start = block->local_host_addr;
+    }
+    if (host_addr < chunk_start) {
+        return 0;
+    }
+    chunk_end = QEMU_RDMA_REG_CHUNK_END(block, mdata->current_chunk);
+    /* clamp to the end of the block, matching the registration code */
+    if (chunk_end > block->local_host_addr + block->length) {
+        chunk_end = block->local_host_addr + block->length;
+    }
+    if (host_addr + len > chunk_end) {
+        return 0;
+    }
+    return 1;
+}
+
+static inline int qemu_rdma_buffer_mergable(
+        struct qemu_rdma_migration_data *mdata,
+        uint64_t offset, uint64_t len)
+{
+    if (mdata->current_length == 0) {
+        return 0;
+    }
+    if (offset != mdata->current_offset + mdata->current_length) {
+        return 0;
+    }
+    if (!qemu_rdma_migration_in_current_block(mdata, offset, len)) {
+        return 0;
+    }
+#ifdef QEMU_RDMA_CHUNK_REGISTRATION
+    if (!qemu_rdma_migration_in_current_chunk(mdata, offset, len)) {
+        return 0;
+    }
+#endif
+    return 1;
+}
+
+/* Note that buffer must be within a single block/chunk. */
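+/* Consecutive pages (e.g. offsets N, N+4K, N+8K within one chunk)
+   accumulate into one merged region; the RDMA write is only posted
+   when the region cannot grow further or reaches MERGE_MAX. */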
+int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata,
+                uint64_t offset, uint64_t len)
+{
+    int index = mdata->current_index;
+    int chunk_index = mdata->current_chunk;
+    int ret;
+
+    /* If we cannot merge it, we flush the current buffer first. */
+    if (!qemu_rdma_buffer_mergable(mdata, offset, len)) {
+        ret = qemu_rdma_migration_write_flush(mdata);
+        if (ret) {
+            return ret;
+        }
+        mdata->current_length = 0;
+        mdata->current_offset = offset;
+
+        if (qemu_rdma_search_ram_block(offset, len,
+                    &mdata->rdma_ram_blocks, &index, &chunk_index)) {
+            return -1;
+        }
+        mdata->current_index = index;
+        mdata->current_chunk = chunk_index;
+    }
+
+    /* merge it */
+    mdata->current_length += len;
+
+    /* flush it if buffer is too large */
+    if (mdata->current_length >= QEMU_RDMA_MIGRATION_MERGE_MAX) {
+        return qemu_rdma_migration_write_flush(mdata);
+    }
+
+    return 0;
+}
+
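+/* Reap at most one completion. Returns the wr_id of the completed work
+ * request, QEMU_RDMA_MIGRATION_WRID_NONE when the CQ is empty, or a
+ * negative value on error. */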
+int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata)
+{
+    int ret;
+    struct ibv_wc wc;
+
+    ret = ibv_poll_cq(mdata->rdma_ctx.cq, 1, &wc);
+    if (!ret) {
+        return QEMU_RDMA_MIGRATION_WRID_NONE;
+    }
+    if (ret < 0) {
+        fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
+        return ret;
+    }
+    if (wc.status != IBV_WC_SUCCESS) {
+        fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
+                        wc.status, ibv_wc_status_str(wc.status));
+        fprintf(stderr, "ibv_poll_cq wrid=%"PRIu64"!\n", wc.wr_id);
+
+        return -1;
+    }
+
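+    /* IBV_WC_RECV is a flag bit set in the opcode of every receive
+       completion; everything else here is one of our signaled sends */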
+    if (!(wc.opcode & IBV_WC_RECV)) {
+        mdata->num_signaled_send--;
+    }
+
+    return (int)wc.wr_id;
+}
+
+int qemu_rdma_migration_wait_for_wrid(
+        struct qemu_rdma_migration_data *mdata,
+        int wrid)
+{
+#ifdef QEMU_RDMA_MIGRATION_BLOCKING
+    return qemu_rdma_migration_block_for_wrid(mdata, wrid);
+#else
+    return qemu_rdma_migration_poll_for_wrid(mdata, wrid);
+#endif
+}
+
+int qemu_rdma_migration_poll_for_wrid(
+        struct qemu_rdma_migration_data *mdata,
+        int wrid)
+{
+    int r = QEMU_RDMA_MIGRATION_WRID_NONE;
+    while (r != wrid) {
+        r = qemu_rdma_migration_poll(mdata);
+        if (r < 0) {
+            return r;
+        }
+    }
+    return 0;
+}
+
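+/* Wait for wrid by polling first, then sleeping on the completion
+ * channel. Notification is re-armed with ibv_req_notify_cq() before
+ * each sleep and the CQ is re-polled afterwards, closing the race
+ * between the last poll and the request for notification. */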
+int qemu_rdma_migration_block_for_wrid(
+        struct qemu_rdma_migration_data *mdata,
+        int wrid)
+{
+    int num_cq_events = 0;
+    int r = QEMU_RDMA_MIGRATION_WRID_NONE;
+    struct ibv_cq *cq;
+    void *cq_ctx;
+
+    if (ibv_req_notify_cq(mdata->rdma_ctx.cq, 0)) {
+        return -1;
+    }
+    /* poll cq first */
+    while (r != wrid) {
+        r = qemu_rdma_migration_poll(mdata);
+        if (r < 0) {
+            return r;
+        }
+        if (r == QEMU_RDMA_MIGRATION_WRID_NONE) {
+            break;
+        }
+    }
+    if (r == wrid) {
+        return 0;
+    }
+
+    while (1) {
+        if (ibv_get_cq_event(mdata->rdma_ctx.comp_channel,
+                    &cq, &cq_ctx)) {
+            goto err_block_for_wrid;
+        }
+        num_cq_events++;
+        if (ibv_req_notify_cq(cq, 0)) {
+            goto err_block_for_wrid;
+        }
+        /* poll cq */
+        while (r != wrid) {
+            r = qemu_rdma_migration_poll(mdata);
+            if (r < 0) {
+                goto err_block_for_wrid;
+            }
+            if (r == QEMU_RDMA_MIGRATION_WRID_NONE) {
+                break;
+            }
+        }
+        if (r == wrid) {
+            goto success_block_for_wrid;
+        }
+    }
+
+success_block_for_wrid:
+    if (num_cq_events) {
+        ibv_ack_cq_events(cq, num_cq_events);
+    }
+    return 0;
+
+err_block_for_wrid:
+    if (num_cq_events) {
+        ibv_ack_cq_events(cq, num_cq_events);
+    }
+    return -1;
+}
+
+void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata)
+{
+    mdata->enabled = 0;
+    if (mdata->sync_mr) {
+        qemu_rdma_migration_dereg_sync(mdata);
+    }
+    if (mdata->remote_info_mr) {
+        qemu_rdma_migration_dereg_remote_info(mdata);
+    }
+    mdata->sync_mr = NULL;
+    mdata->remote_info_mr = NULL;
+    qemu_rdma_dereg_ram_blocks(&mdata->rdma_ram_blocks);
+    mdata->rdma_ram_blocks.num_blocks = 0;
+    qemu_rdma_cleanup(&mdata->rdma_ctx);
+    qemu_rdma_migration_data_init(mdata);
+}
+
+int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata)
+{
+    int ret;
+
+    if (mdata->client_init_done) {
+        return 0;
+    }
+
+    ret = qemu_rdma_resolve_host(&mdata->rdma_ctx,
+            mdata->host, mdata->port);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error resolving host!\n");
+        goto err_rdma_client_init;
+    }
+
+    ret = qemu_rdma_alloc_pd_cq(&mdata->rdma_ctx);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
+        goto err_rdma_client_init;
+    }
+
+    ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error allocating qp!\n");
+        goto err_rdma_client_init;
+    }
+
+    ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
+        goto err_rdma_client_init;
+    }
+
+    ret = qemu_rdma_client_reg_ram_blocks(&mdata->rdma_ctx,
+            &mdata->rdma_ram_blocks);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error registering ram blocks!\n");
+        goto err_rdma_client_init;
+    }
+
+    ret = qemu_rdma_migration_reg_sync(mdata);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error registering sync data!\n");
+        goto err_rdma_client_init;
+    }
+
+    ret = qemu_rdma_migration_reg_remote_info(mdata);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error registering remote info!\n");
+        goto err_rdma_client_init;
+    }
+
+    ret = qemu_rdma_migration_post_recv_remote_info(mdata);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error posting remote info recv!\n");
+        goto err_rdma_client_init;
+    }
+
+    mdata->client_init_done = 1;
+    return 0;
+
+err_rdma_client_init:
+    qemu_rdma_migration_cleanup(mdata);
+    return -1;
+}
+
+
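+/* Connect to the server and wait for its ram block table, which
+ * supplies the remote addresses and rkeys needed to target writes. */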
+int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata)
+{
+
+    int ret;
+
+    ret = qemu_rdma_connect(&mdata->rdma_ctx, NULL, NULL, NULL, 0);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error connecting!\n");
+        goto err_rdma_client_connect;
+    }
+
+    /* wait for remote info */
+    ret = qemu_rdma_migration_wait_for_wrid(mdata,
+            QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO);
+    if (ret < 0) {
+        fprintf(stderr, "rdma migration: error receiving remote info!\n");
+        goto err_rdma_client_connect;
+    }
+
+    ret = qemu_rdma_process_remote_ram_blocks(
+            &mdata->rdma_ram_blocks, &mdata->remote_info);
+    if (ret) {
+        fprintf(stderr,
+                "rdma migration: error processing remote ram blocks!\n");
+        goto err_rdma_client_connect;
+    }
+
+    mdata->total_bytes = 0;
+    mdata->enabled = 1;
+    return 0;
+
+err_rdma_client_connect:
+    qemu_rdma_migration_cleanup(mdata);
+    return -1;
+}
+
+int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata)
+{
+
+    int ret;
+    struct sockaddr_in sin;
+    struct rdma_cm_id *listen_id;
+    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
+
+    /* create CM channel */
+    rdma_ctx->channel = rdma_create_event_channel();
+    if (!rdma_ctx->channel) {
+        return -1;
+    }
+
+    /* create CM id */
+    ret = rdma_create_id(rdma_ctx->channel, &listen_id, NULL,
+            RDMA_PS_TCP);
+    if (ret) {
+        goto err_server_init_create_listen_id;
+    }
+
+    memset(&sin, 0, sizeof(sin));
+    sin.sin_family = AF_INET;
+    sin.sin_port = htons(mdata->port);
+    if (strcmp("", mdata->host)) {
+        struct hostent *server_addr;
+        server_addr = gethostbyname(mdata->host);
+        if (!server_addr) {
+            goto err_server_init_bind_addr;
+        }
+        memcpy(&sin.sin_addr.s_addr, server_addr->h_addr,
+                server_addr->h_length);
+    } else {
+        sin.sin_addr.s_addr = INADDR_ANY;
+    }
+
+    ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin);
+    if (ret) {
+        goto err_server_init_bind_addr;
+    }
+    printf("verbs context after binding: %p\n", listen_id->verbs);
+
+    rdma_ctx->listen_id = listen_id;
+    if (listen_id->verbs) {
+        rdma_ctx->verbs = listen_id->verbs;
+    }
+    return 0;
+
+err_server_init_bind_addr:
+    rdma_destroy_id(listen_id);
+err_server_init_create_listen_id:
+    rdma_destroy_event_channel(rdma_ctx->channel);
+    rdma_ctx->channel = NULL;
+
+    return -1;
+
+}
+
+int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata)
+{
+    int ret;
+    struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx;
+
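+    /* with a wildcard bind the device is not known until a client
+       connects; qemu_rdma_listen() calls back in here at that point */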
+    if (!rdma_ctx->verbs) {
+        return 0;
+    }
+
+    ret = qemu_rdma_alloc_pd_cq(rdma_ctx);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
+        goto err_rdma_server_prepare;
+    }
+
+    ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
+        goto err_rdma_server_prepare;
+    }
+
+    ret = qemu_rdma_server_reg_ram_blocks(rdma_ctx,
+            &mdata->rdma_ram_blocks);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error registering ram blocks!\n");
+        goto err_rdma_server_prepare;
+    }
+
+    ret = qemu_rdma_migration_reg_sync(mdata);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error registering sync data!\n");
+        goto err_rdma_server_prepare;
+    }
+
+    qemu_rdma_copy_to_remote_ram_blocks(&mdata->rdma_ram_blocks,
+            &mdata->remote_info);
+
+    ret = qemu_rdma_migration_reg_remote_info(mdata);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error registering remote info!\n");
+        goto err_rdma_server_prepare;
+    }
+
+    ret = rdma_listen(rdma_ctx->listen_id, 5);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error listening on socket!\n");
+        goto err_rdma_server_prepare;
+    }
+
+    return 0;
+
+err_rdma_server_prepare:
+    qemu_rdma_migration_cleanup(mdata);
+    return -1;
+}
+
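+/* Accept the connection and perform the setup exchange: receives for
+ * the client's sync messages are posted before rdma_accept() so they
+ * cannot arrive unmatched, then the ram block table is sent over. */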
+int qemu_rdma_migration_server_wait_for_client(
+                                    struct qemu_rdma_migration_data *mdata)
+{
+
+    int ret;
+
+    ret = qemu_rdma_listen(mdata, mdata->host, mdata->port);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error listening!\n");
+        goto err_rdma_server_wait;
+    }
+
+    ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error allocating qp!\n");
+        goto err_rdma_server_wait;
+    }
+
+#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC
+    ret = qemu_rdma_migration_post_recv_sync(mdata,
+            QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error posting extra sync receive!\n");
+        goto err_rdma_server_wait;
+    }
+#endif
+
+    ret = qemu_rdma_migration_post_recv_sync(mdata,
+            QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error posting sync receive!\n");
+        goto err_rdma_server_wait;
+    }
+
+    ret = qemu_rdma_accept(&mdata->rdma_ctx, NULL, NULL, NULL, 0);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error accepting connection!\n");
+        goto err_rdma_server_wait;
+    }
+
+    /* send remote info */
+    ret = qemu_rdma_migration_post_send_remote_info(mdata);
+    if (ret) {
+        fprintf(stderr, "rdma migration: error sending remote info!\n");
+        goto err_rdma_server_wait;
+    }
+
+    /* wait for completion */
+    ret = qemu_rdma_migration_wait_for_wrid(mdata,
+            QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO);
+    if (ret < 0) {
+        fprintf(stderr,
+                "rdma migration: error completing remote info send!\n");
+        goto err_rdma_server_wait;
+    }
+
+    mdata->total_bytes = 0;
+    mdata->enabled = 1;
+    return 0;
+
+err_rdma_server_wait:
+    qemu_rdma_migration_cleanup(mdata);
+    return -1;
+
+}
+
+void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata)
+{
+    qemu_rdma_init_context(&mdata->rdma_ctx);
+    mdata->port = rdmaport;
+    mdata->host = rdmahost;
+    mdata->enabled = 0;
+    mdata->rdma_ram_blocks.num_blocks = 0;
+    mdata->client_init_done = 0;
+    mdata->num_unsignaled_send = 0;
+    mdata->num_signaled_send = 0;
+    mdata->current_offset = 0;
+    mdata->current_length = 0;
+    mdata->current_index = -1;
+    mdata->current_chunk = -1;
+    mdata->sync = 0;
+    mdata->sync_mr = NULL;
+    mdata->remote_info_mr = NULL;
+}
+
+void qemu_rdma_migration_disable(void)
+{
+    rdma_mdata.port = -1;
+    rdma_mdata.enabled = 0;
+}