Patchwork [RFC,RDMA,support,v2:,6/6] send memory over RDMA as blocks are iterated

login
register
mail settings
Submitter mrhines@linux.vnet.ibm.com
Date Feb. 11, 2013, 10:49 p.m.
Message ID <1360622997-26904-6-git-send-email-mrhines@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/219698/
State New
Headers show

Comments

mrhines@linux.vnet.ibm.com - Feb. 11, 2013, 10:49 p.m.
From: "Michael R. Hines" <mrhines@us.ibm.com>


Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
---
 arch_init.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 savevm.c    |   59 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 139 insertions(+), 4 deletions(-)
Orit Wasserman - Feb. 13, 2013, 9:50 a.m.
On 02/12/2013 12:49 AM, Michael R. Hines wrote:
> From: "Michael R. Hines" <mrhines@us.ibm.com>
> 
> 
> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
> ---
>  arch_init.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  savevm.c    |   59 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 139 insertions(+), 4 deletions(-)
> 
> diff --git a/arch_init.c b/arch_init.c
> index dada6de..76092cc 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -42,6 +42,7 @@
>  #include "migration/migration.h"
>  #include "exec/gdbstub.h"
>  #include "hw/smbios.h"
> +#include "qemu/rdma.h"
>  #include "exec/address-spaces.h"
>  #include "hw/pcspk.h"
>  #include "migration/page_cache.h"
> @@ -170,6 +171,15 @@ static int is_dup_page(uint8_t *page)
>      VECTYPE val = SPLAT(page);
>      int i;
>  
> +    /*
> +     * RFC RDMA: The empirical cost of searching for zero pages here
> +     *           plus the cost of communicating with the other side
> +     *           seems to take significantly more time than simply
> +     *           dumping the page into remote memory.
> +     */
> +    if (migrate_rdma_enabled())
> +        return 0;
> +
I would change ram_save_block not this function.
So I would move the if to before calling is_dup.

>      for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
>          if (!ALL_EQ(val, p[i])) {
>              return 0;
> @@ -282,6 +292,44 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
>      return size;
>  }
>  
> +static size_t save_rdma_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
> +                             int cont)
> +{
> +    size_t bytes_sent = 0;
> +    ram_addr_t current_addr;
> +
> +    acct_info.norm_pages++;
> +
> +    /*
> +     * use RDMA to send page
> +     */
> +    current_addr = block->offset + offset;
> +    if (rdma_write(&rdma_mdata, current_addr,
> +            TARGET_PAGE_SIZE)) {
> +        fprintf(stderr, "rdma migration: write error!\n");
> +        qemu_file_set_error(f, -EIO);
we lose the error here;
why not let rdma_write set the error itself — pass it the QEMUFile as a parameter.
> +        return 0;
> +    }
> +
> +    /*
> +     * do some polling
> +     */
> +    while (1) {
> +        int ret = rdma_poll(&rdma_mdata);
> +        if (ret == RDMA_WRID_NONE) {
> +            break;
> +        }
> +        if (ret < 0) {
> +            fprintf(stderr, "rdma migration: polling error!\n");
> +            qemu_file_set_error(f, -EIO);
see above
> +            return 0;
> +        }
> +    }
> +
> +    bytes_sent += TARGET_PAGE_SIZE;
> +    return bytes_sent;
> +}
> +
>  #define ENCODING_FLAG_XBZRLE 0x1
>  
>  static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
> @@ -474,6 +522,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
>                  if (!last_stage) {
>                      p = get_cached_data(XBZRLE.cache, current_addr);
>                  }
> +            } else if (migrate_rdma_enabled()) {
> +                bytes_sent = save_rdma_page(f, block, offset, cont);
>              }
You can move it to the top and then you won't need to change is_dup.
>  
>              /* XBZRLE overflow or normal page */
> @@ -601,12 +651,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>      return 0;
>  }
>  
> +static int tprate = 1000;
> +
>  static int ram_save_iterate(QEMUFile *f, void *opaque)
>  {
>      int ret;
>      int i;
> -    int64_t t0;
> -    int total_sent = 0;
> +    int64_t t0, tp0;
> +    int total_sent = 0, last_total_sent = 0;
>  
>      qemu_mutex_lock_ramlist();
>  
> @@ -625,23 +677,49 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>              break;
>          }
>          total_sent += bytes_sent;
> +        last_total_sent += bytes_sent;
>          acct_info.iterations++;
>          /* we want to check in the 1st loop, just in case it was the 1st time
>             and we had to sync the dirty bitmap.
>             qemu_get_clock_ns() is a bit expensive, so we only check each some
>             iterations
>          */
> +
> +        /*
> +         * RFC RDMA: Can we have something like this to periodically print
> +         *           out throughput? This is just a rough-sketch that 
> +         *           partially worked for me. I assume there a better way 
> +         *           that everyone would prefer. Perhaps we could set a QMP 
> +         *           command that toggled a "periodic printing" option that 
> +         *           allowed more details to be printed on stdout.....?
> +         */
>          if ((i & 63) == 0) {
> -            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
> +            uint64_t curr = qemu_get_clock_ns(rt_clock);
> +            uint64_t t1 = (curr - t0) / 1000000;
> +            double tp;
>              if (t1 > MAX_WAIT) {
>                  DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
>                          t1, i);
>                  break;
>              }
> +
> +            if ((i % tprate) == 0) {
> +                uint64_t tp1 = (curr - tp0) / 1000000;
> +                tp = ((double) last_total_sent * 8.0 /
> +                               ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
> +                printf("throughput: %f mbps\n", tp);
DPRINTF please ...
> +                last_total_sent = 0;
> +                tp0 = curr;
> +            }
>          }
>          i++;
>      }
>  
> +    if (migrate_rdma_enabled() && rdma_write_flush(&rdma_mdata) < 0) {
> +        qemu_file_set_error(f, -EIO);
see above ...
> +        return 0;
> +    }
> +
>      qemu_mutex_unlock_ramlist();
>  
>      if (ret < 0) {
> diff --git a/savevm.c b/savevm.c
> index 304d1ef..4d0bef3 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -24,6 +24,7 @@
>  
>  #include "config-host.h"
>  #include "qemu-common.h"
> +#include "qemu/rdma.h"
>  #include "hw/hw.h"
>  #include "hw/qdev.h"
>  #include "net/net.h"
> @@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
>      return f->last_error;
>  }
>  
> -static void qemu_file_set_error(QEMUFile *f, int ret)
> +void qemu_file_set_error(QEMUFile *f, int ret)
>  {
>      if (f->last_error == 0) {
>          f->last_error = ret;
> @@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
>  {
>      SaveStateEntry *se;
>      int ret = 1;
> +    static int first_time = 1;
>  
>      QTAILQ_FOREACH(se, &savevm_handlers, entry) {
>          if (!se->ops || !se->ops->save_live_iterate) {
> @@ -1643,6 +1645,30 @@ int qemu_savevm_state_iterate(QEMUFile *f)
>          }
>      }
>      if (ret != 0) {
> +#ifdef RDMA_EXTRA_SYNC
> +        /*
> +         * We use two "sync" infiniband messages happen during migration.
> +         * One at the beginning and one at the end, just to be thorough.
> +         * This is the first one.
> +         */
> +        if (first_time && migrate_rdma_enabled()) {
Can't this code move to qemu_savevm_state_begin or even better to ram_save_setup  ..
> +            int r;
> +            first_time = 0;
> +            if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC)) {
> +                fprintf(stderr,
> +                        "rdma migration: error posting extra send sync!\n");
> +                return -EIO;
Why not return the real error? Please give more details on
the actual error so the user can understand why migration failed.
Also please update the QEMUFile error state too.
> +            }
> +
> +            r = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC);
> +            if (r < 0) {
> +                fprintf(stderr,
> +                        "rdma migration: qemu_savevm_state_iterate"
> +                        " sync polling error!\n");
> +                return -EIO;
same here ...
> +            }
> +        }
> +#endif
>          return ret;
>      }
>      ret = qemu_file_get_error(f);
> @@ -1703,8 +1729,30 @@ int qemu_savevm_state_complete(QEMUFile *f)
>          trace_savevm_section_end(se->section_id);
>      }
>  
> +    /*
> +     * We use two "sync" infiniband messages happen during migration.
> +     * One at the beginning and one at the end, just to be thorough.
> +     * This is the second one.
> +     */
> +    if (migrate_rdma_enabled()) {
> +        if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_SYNC)) {
> +            fprintf(stderr, "rdma migration: error posting send sync!\n");
> +            return -EIO;
can't this moved to ram_save_complete?
Regards,
Orit
> +        }
> +    }
> +
>      qemu_put_byte(f, QEMU_VM_EOF);
>  
> +    /* wait for RDMA sync message to complete */
> +    if (migrate_rdma_enabled()) {
> +        int ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_SYNC);
> +        if (ret < 0) {
> +            fprintf(stderr, "rdma migration: qemu_savevm_state_full"
> +                            " sync polling error!\n");
> +            return -EIO;
...
> +        }
> +     }
> +
>      return qemu_file_get_error(f);
>  }
>  
> @@ -2014,6 +2062,15 @@ int qemu_loadvm_state(QEMUFile *f)
>  
>      cpu_synchronize_all_post_init();
>  
> +    /* wait for RDMA sync message */
> +    if (migrate_rdma_enabled()) {
> +        ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_RECV_SYNC);
> +        if (ret < 0) {
> +            fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
> +                            " sync polling error!\n");
> +            goto out;
> +        }
> +    }
>      ret = 0;
>  
>  out:
>
mrhines@linux.vnet.ibm.com - Feb. 13, 2013, 1:49 p.m.
Very helpful, thank you. I'll implement these and all the other ones soon.

- Michael

On 02/13/2013 04:50 AM, Orit Wasserman wrote:
> On 02/12/2013 12:49 AM, Michael R. Hines wrote:
>> From: "Michael R. Hines" <mrhines@us.ibm.com>
>>
>>
>> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
>> ---
>>   arch_init.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>   savevm.c    |   59 ++++++++++++++++++++++++++++++++++++++++-
>>   2 files changed, 139 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch_init.c b/arch_init.c
>> index dada6de..76092cc 100644
>> --- a/arch_init.c
>> +++ b/arch_init.c
>> @@ -42,6 +42,7 @@
>>   #include "migration/migration.h"
>>   #include "exec/gdbstub.h"
>>   #include "hw/smbios.h"
>> +#include "qemu/rdma.h"
>>   #include "exec/address-spaces.h"
>>   #include "hw/pcspk.h"
>>   #include "migration/page_cache.h"
>> @@ -170,6 +171,15 @@ static int is_dup_page(uint8_t *page)
>>       VECTYPE val = SPLAT(page);
>>       int i;
>>   
>> +    /*
>> +     * RFC RDMA: The empirical cost of searching for zero pages here
>> +     *           plus the cost of communicating with the other side
>> +     *           seems to take significantly more time than simply
>> +     *           dumping the page into remote memory.
>> +     */
>> +    if (migrate_rdma_enabled())
>> +        return 0;
>> +
> I would change ram_save_block not this function.
> So I would move the if to before calling is_dup.
>
>>       for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
>>           if (!ALL_EQ(val, p[i])) {
>>               return 0;
>> @@ -282,6 +292,44 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
>>       return size;
>>   }
>>   
>> +static size_t save_rdma_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
>> +                             int cont)
>> +{
>> +    size_t bytes_sent = 0;
>> +    ram_addr_t current_addr;
>> +
>> +    acct_info.norm_pages++;
>> +
>> +    /*
>> +     * use RDMA to send page
>> +     */
>> +    current_addr = block->offset + offset;
>> +    if (rdma_write(&rdma_mdata, current_addr,
>> +            TARGET_PAGE_SIZE)) {
>> +        fprintf(stderr, "rdma migration: write error!\n");
>> +        qemu_file_set_error(f, -EIO);
> we lose the error here,
> why not let rdma_write to set the error, let it get the QemuFile as a parameter.
>> +        return 0;
>> +    }
>> +
>> +    /*
>> +     * do some polling
>> +     */
>> +    while (1) {
>> +        int ret = rdma_poll(&rdma_mdata);
>> +        if (ret == RDMA_WRID_NONE) {
>> +            break;
>> +        }
>> +        if (ret < 0) {
>> +            fprintf(stderr, "rdma migration: polling error!\n");
>> +            qemu_file_set_error(f, -EIO);
> see above
>> +            return 0;
>> +        }
>> +    }
>> +
>> +    bytes_sent += TARGET_PAGE_SIZE;
>> +    return bytes_sent;
>> +}
>> +
>>   #define ENCODING_FLAG_XBZRLE 0x1
>>   
>>   static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
>> @@ -474,6 +522,8 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
>>                   if (!last_stage) {
>>                       p = get_cached_data(XBZRLE.cache, current_addr);
>>                   }
>> +            } else if (migrate_rdma_enabled()) {
>> +                bytes_sent = save_rdma_page(f, block, offset, cont);
>>               }
> You can move it to the top and than you won't need to change is_dup.
>>   
>>               /* XBZRLE overflow or normal page */
>> @@ -601,12 +651,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>       return 0;
>>   }
>>   
>> +static int tprate = 1000;
>> +
>>   static int ram_save_iterate(QEMUFile *f, void *opaque)
>>   {
>>       int ret;
>>       int i;
>> -    int64_t t0;
>> -    int total_sent = 0;
>> +    int64_t t0, tp0;
>> +    int total_sent = 0, last_total_sent = 0;
>>   
>>       qemu_mutex_lock_ramlist();
>>   
>> @@ -625,23 +677,49 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>>               break;
>>           }
>>           total_sent += bytes_sent;
>> +        last_total_sent += bytes_sent;
>>           acct_info.iterations++;
>>           /* we want to check in the 1st loop, just in case it was the 1st time
>>              and we had to sync the dirty bitmap.
>>              qemu_get_clock_ns() is a bit expensive, so we only check each some
>>              iterations
>>           */
>> +
>> +        /*
>> +         * RFC RDMA: Can we have something like this to periodically print
>> +         *           out throughput? This is just a rough-sketch that
>> +         *           partially worked for me. I assume there a better way
>> +         *           that everyone would prefer. Perhaps we could set a QMP
>> +         *           command that toggled a "periodic printing" option that
>> +         *           allowed more details to be printed on stdout.....?
>> +         */
>>           if ((i & 63) == 0) {
>> -            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
>> +            uint64_t curr = qemu_get_clock_ns(rt_clock);
>> +            uint64_t t1 = (curr - t0) / 1000000;
>> +            double tp;
>>               if (t1 > MAX_WAIT) {
>>                   DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
>>                           t1, i);
>>                   break;
>>               }
>> +
>> +            if ((i % tprate) == 0) {
>> +                uint64_t tp1 = (curr - tp0) / 1000000;
>> +                tp = ((double) last_total_sent * 8.0 /
>> +                               ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
>> +                printf("throughput: %f mbps\n", tp);
> DPRINTF please ...
>> +                last_total_sent = 0;
>> +                tp0 = curr;
>> +            }
>>           }
>>           i++;
>>       }
>>   
>> +    if (migrate_rdma_enabled() && rdma_write_flush(&rdma_mdata) < 0) {
>> +        qemu_file_set_error(f, -EIO);
> see above ...
>> +        return 0;
>> +    }
>> +
>>       qemu_mutex_unlock_ramlist();
>>   
>>       if (ret < 0) {
>> diff --git a/savevm.c b/savevm.c
>> index 304d1ef..4d0bef3 100644
>> --- a/savevm.c
>> +++ b/savevm.c
>> @@ -24,6 +24,7 @@
>>   
>>   #include "config-host.h"
>>   #include "qemu-common.h"
>> +#include "qemu/rdma.h"
>>   #include "hw/hw.h"
>>   #include "hw/qdev.h"
>>   #include "net/net.h"
>> @@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
>>       return f->last_error;
>>   }
>>   
>> -static void qemu_file_set_error(QEMUFile *f, int ret)
>> +void qemu_file_set_error(QEMUFile *f, int ret)
>>   {
>>       if (f->last_error == 0) {
>>           f->last_error = ret;
>> @@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
>>   {
>>       SaveStateEntry *se;
>>       int ret = 1;
>> +    static int first_time = 1;
>>   
>>       QTAILQ_FOREACH(se, &savevm_handlers, entry) {
>>           if (!se->ops || !se->ops->save_live_iterate) {
>> @@ -1643,6 +1645,30 @@ int qemu_savevm_state_iterate(QEMUFile *f)
>>           }
>>       }
>>       if (ret != 0) {
>> +#ifdef RDMA_EXTRA_SYNC
>> +        /*
>> +         * We use two "sync" infiniband messages happen during migration.
>> +         * One at the beginning and one at the end, just to be thorough.
>> +         * This is the first one.
>> +         */
>> +        if (first_time && migrate_rdma_enabled()) {
> Can't this code move to qemu_savevm_state_begin or even better to ram_save_setup  ..
>> +            int r;
>> +            first_time = 0;
>> +            if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC)) {
>> +                fprintf(stderr,
>> +                        "rdma migration: error posting extra send sync!\n");
>> +                return -EIO;
> Why not get the real error, please try to give more details on
> the actual error to allow the user to understand why migration failed.
> Also please update QemuFile error too.
>> +            }
>> +
>> +            r = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC);
>> +            if (r < 0) {
>> +                fprintf(stderr,
>> +                        "rdma migration: qemu_savevm_state_iterate"
>> +                        " sync polling error!\n");
>> +                return -EIO;
> same here ...
>> +            }
>> +        }
>> +#endif
>>           return ret;
>>       }
>>       ret = qemu_file_get_error(f);
>> @@ -1703,8 +1729,30 @@ int qemu_savevm_state_complete(QEMUFile *f)
>>           trace_savevm_section_end(se->section_id);
>>       }
>>   
>> +    /*
>> +     * We use two "sync" infiniband messages happen during migration.
>> +     * One at the beginning and one at the end, just to be thorough.
>> +     * This is the second one.
>> +     */
>> +    if (migrate_rdma_enabled()) {
>> +        if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_SYNC)) {
>> +            fprintf(stderr, "rdma migration: error posting send sync!\n");
>> +            return -EIO;
> can't this moved to ram_save_complete?
> Regards,
> Orit
>> +        }
>> +    }
>> +
>>       qemu_put_byte(f, QEMU_VM_EOF);
>>   
>> +    /* wait for RDMA sync message to complete */
>> +    if (migrate_rdma_enabled()) {
>> +        int ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_SYNC);
>> +        if (ret < 0) {
>> +            fprintf(stderr, "rdma migration: qemu_savevm_state_full"
>> +                            " sync polling error!\n");
>> +            return -EIO;
> ...
>> +        }
>> +     }
>> +
>>       return qemu_file_get_error(f);
>>   }
>>   
>> @@ -2014,6 +2062,15 @@ int qemu_loadvm_state(QEMUFile *f)
>>   
>>       cpu_synchronize_all_post_init();
>>   
>> +    /* wait for RDMA sync message */
>> +    if (migrate_rdma_enabled()) {
>> +        ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_RECV_SYNC);
>> +        if (ret < 0) {
>> +            fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
>> +                            " sync polling error!\n");
>> +            goto out;
>> +        }
>> +    }
>>       ret = 0;
>>   
>>   out:
>>
Paolo Bonzini - Feb. 18, 2013, 11:01 a.m.
Il 11/02/2013 23:49, Michael R. Hines ha scritto:
>  
> +    /*
> +     * RFC RDMA: The empirical cost of searching for zero pages here
> +     *           plus the cost of communicating with the other side
> +     *           seems to take significantly more time than simply
> +     *           dumping the page into remote memory.
> +     */
> +    if (migrate_rdma_enabled())
> +        return 0;

This is probably the only if (rdma) that should remain in the end. :)

>      for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
>          if (!ALL_EQ(val, p[i])) {
>              return 0;
> @@ -282,6 +292,44 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
>      return size;
>  }
>  
> +static size_t save_rdma_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
> +                             int cont)
> +{
> +    size_t bytes_sent = 0;
> +    ram_addr_t current_addr;
> +
> +    acct_info.norm_pages++;
> +
> +    /*
> +     * use RDMA to send page
> +     */
> +    current_addr = block->offset + offset;
> +    if (rdma_write(&rdma_mdata, current_addr,
> +            TARGET_PAGE_SIZE)) {
> +        fprintf(stderr, "rdma migration: write error!\n");
> +        qemu_file_set_error(f, -EIO);
> +        return 0;
> +    }
> +
> +    /*
> +     * do some polling
> +     */
> +    while (1) {
> +        int ret = rdma_poll(&rdma_mdata);
> +        if (ret == RDMA_WRID_NONE) {
> +            break;
> +        }
> +        if (ret < 0) {
> +            fprintf(stderr, "rdma migration: polling error!\n");
> +            qemu_file_set_error(f, -EIO);
> +            return 0;
> +        }
> +    }
> +
> +    bytes_sent += TARGET_PAGE_SIZE;
> +    return bytes_sent;
> +}

Orit, can you rebase and post an RFC of your vectored-send patches for
TCP migration?  Perhaps you and Michael can figure out an API that works
well for both TCP and RDMA.

> +#ifdef RDMA_EXTRA_SYNC
> +        /*
> +         * We use two "sync" infiniband messages happen during migration.
> +         * One at the beginning and one at the end, just to be thorough.
> +         * This is the first one.
> +         */
> +        if (first_time && migrate_rdma_enabled()) {
> +            int r;
> +            first_time = 0;
> +            if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC)) {
> +                fprintf(stderr,
> +                        "rdma migration: error posting extra send sync!\n");
> +                return -EIO;
> +            }
> +
> +            r = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC);
> +            if (r < 0) {
> +                fprintf(stderr,
> +                        "rdma migration: qemu_savevm_state_iterate"
> +                        " sync polling error!\n");
> +                return -EIO;
> +            }
> +        }
> +#endif

This "sync" thing sounds like something that could be used to send
buffered device state on the same channel.  But again, I'm quite
RDMA-impaired. :)

Paolo
Orit Wasserman - Feb. 18, 2013, 1:10 p.m.
On 02/18/2013 01:01 PM, Paolo Bonzini wrote:
> Il 11/02/2013 23:49, Michael R. Hines ha scritto:
>>  
>> +    /*
>> +     * RFC RDMA: The empirical cost of searching for zero pages here
>> +     *           plus the cost of communicating with the other side
>> +     *           seems to take significantly more time than simply
>> +     *           dumping the page into remote memory.
>> +     */
>> +    if (migrate_rdma_enabled())
>> +        return 0;
> 
> This is probably the only if (rdma) that should remain in the end. :)
> 
>>      for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
>>          if (!ALL_EQ(val, p[i])) {
>>              return 0;
>> @@ -282,6 +292,44 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
>>      return size;
>>  }
>>  
>> +static size_t save_rdma_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
>> +                             int cont)
>> +{
>> +    size_t bytes_sent = 0;
>> +    ram_addr_t current_addr;
>> +
>> +    acct_info.norm_pages++;
>> +
>> +    /*
>> +     * use RDMA to send page
>> +     */
>> +    current_addr = block->offset + offset;
>> +    if (rdma_write(&rdma_mdata, current_addr,
>> +            TARGET_PAGE_SIZE)) {
>> +        fprintf(stderr, "rdma migration: write error!\n");
>> +        qemu_file_set_error(f, -EIO);
>> +        return 0;
>> +    }
>> +
>> +    /*
>> +     * do some polling
>> +     */
>> +    while (1) {
>> +        int ret = rdma_poll(&rdma_mdata);
>> +        if (ret == RDMA_WRID_NONE) {
>> +            break;
>> +        }
>> +        if (ret < 0) {
>> +            fprintf(stderr, "rdma migration: polling error!\n");
>> +            qemu_file_set_error(f, -EIO);
>> +            return 0;
>> +        }
>> +    }
>> +
>> +    bytes_sent += TARGET_PAGE_SIZE;
>> +    return bytes_sent;
>> +}
> 
> Orit, can you rebase and post an RFC of your vectored-send patches for
> TCP migration?  Perhaps you and Michael can figure out an API that works
> well for both TCP and RDMA.
Paolo, 
I am currently working on rebasing.
I need to think about how to design an API that will work for both TCP and RDMA.

Orit
> 
>> +#ifdef RDMA_EXTRA_SYNC
>> +        /*
>> +         * We use two "sync" infiniband messages happen during migration.
>> +         * One at the beginning and one at the end, just to be thorough.
>> +         * This is the first one.
>> +         */
>> +        if (first_time && migrate_rdma_enabled()) {
>> +            int r;
>> +            first_time = 0;
>> +            if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC)) {
>> +                fprintf(stderr,
>> +                        "rdma migration: error posting extra send sync!\n");
>> +                return -EIO;
>> +            }
>> +
>> +            r = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC);
>> +            if (r < 0) {
>> +                fprintf(stderr,
>> +                        "rdma migration: qemu_savevm_state_iterate"
>> +                        " sync polling error!\n");
>> +                return -EIO;
>> +            }
>> +        }
>> +#endif
> 
> This "sync" thing sounds like something that could be used to send
> buffered device state on the same channel.  But again, I'm quite
> RDMA-impaired. :)
> 
> Paolo
> 
>
mrhines@linux.vnet.ibm.com - Feb. 19, 2013, 6:11 a.m.
On 02/18/2013 06:01 AM, Paolo Bonzini wrote:
>
> Orit, can you rebase and post an RFC of your vectored-send patches for
> TCP migration?  Perhaps you and Michael can figure out an API that works
> well for both TCP and RDMA.
Looking forward to reading these.....
>> +#ifdef RDMA_EXTRA_SYNC
>> +        /*
>> +         * We use two "sync" infiniband messages happen during migration.
>> +         * One at the beginning and one at the end, just to be thorough.
>> +         * This is the first one.
>> +         */
>> +        }
>>
> This "sync" thing sounds like something that could be used to send
> buffered device state on the same channel.  But again, I'm quite
> RDMA-impaired. :)
>
> Paolo
>
Actually, we don't see anything fundamentally wrong with that - I'll 
watch for Orit's patches on the mailing list before coming up with a new 
design.

- Michael

Patch

diff --git a/arch_init.c b/arch_init.c
index dada6de..76092cc 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -42,6 +42,7 @@ 
 #include "migration/migration.h"
 #include "exec/gdbstub.h"
 #include "hw/smbios.h"
+#include "qemu/rdma.h"
 #include "exec/address-spaces.h"
 #include "hw/pcspk.h"
 #include "migration/page_cache.h"
@@ -170,6 +171,15 @@  static int is_dup_page(uint8_t *page)
     VECTYPE val = SPLAT(page);
     int i;
 
+    /*
+     * RFC RDMA: The empirical cost of searching for zero pages here
+     *           plus the cost of communicating with the other side
+     *           seems to take significantly more time than simply
+     *           dumping the page into remote memory.
+     */
+    if (migrate_rdma_enabled())
+        return 0;
+
     for (i = 0; i < TARGET_PAGE_SIZE / sizeof(VECTYPE); i++) {
         if (!ALL_EQ(val, p[i])) {
             return 0;
@@ -282,6 +292,44 @@  static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
     return size;
 }
 
+static size_t save_rdma_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
+                             int cont)
+{
+    size_t bytes_sent = 0;
+    ram_addr_t current_addr;
+
+    acct_info.norm_pages++;
+
+    /*
+     * use RDMA to send page
+     */
+    current_addr = block->offset + offset;
+    if (rdma_write(&rdma_mdata, current_addr,
+            TARGET_PAGE_SIZE)) {
+        fprintf(stderr, "rdma migration: write error!\n");
+        qemu_file_set_error(f, -EIO);
+        return 0;
+    }
+
+    /*
+     * do some polling
+     */
+    while (1) {
+        int ret = rdma_poll(&rdma_mdata);
+        if (ret == RDMA_WRID_NONE) {
+            break;
+        }
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: polling error!\n");
+            qemu_file_set_error(f, -EIO);
+            return 0;
+        }
+    }
+
+    bytes_sent += TARGET_PAGE_SIZE;
+    return bytes_sent;
+}
+
 #define ENCODING_FLAG_XBZRLE 0x1
 
 static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
@@ -474,6 +522,8 @@  static int ram_save_block(QEMUFile *f, bool last_stage)
                 if (!last_stage) {
                     p = get_cached_data(XBZRLE.cache, current_addr);
                 }
+            } else if (migrate_rdma_enabled()) {
+                bytes_sent = save_rdma_page(f, block, offset, cont);
             }
 
             /* XBZRLE overflow or normal page */
@@ -601,12 +651,14 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
     return 0;
 }
 
+static int tprate = 1000;
+
 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
     int ret;
     int i;
-    int64_t t0;
-    int total_sent = 0;
+    int64_t t0, tp0;
+    int total_sent = 0, last_total_sent = 0;
 
     qemu_mutex_lock_ramlist();
 
@@ -625,23 +677,49 @@  static int ram_save_iterate(QEMUFile *f, void *opaque)
             break;
         }
         total_sent += bytes_sent;
+        last_total_sent += bytes_sent;
         acct_info.iterations++;
         /* we want to check in the 1st loop, just in case it was the 1st time
            and we had to sync the dirty bitmap.
            qemu_get_clock_ns() is a bit expensive, so we only check each some
            iterations
         */
+
+        /*
+         * RFC RDMA: Can we have something like this to periodically print
+         *           out throughput? This is just a rough-sketch that 
+         *           partially worked for me. I assume there a better way 
+         *           that everyone would prefer. Perhaps we could set a QMP 
+         *           command that toggled a "periodic printing" option that 
+         *           allowed more details to be printed on stdout.....?
+         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
+            uint64_t curr = qemu_get_clock_ns(rt_clock);
+            uint64_t t1 = (curr - t0) / 1000000;
+            double tp;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
                 break;
             }
+
+            if ((i % tprate) == 0) {
+                uint64_t tp1 = (curr - tp0) / 1000000;
+                tp = ((double) last_total_sent * 8.0 /
+                               ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
+                printf("throughput: %f mbps\n", tp);
+                last_total_sent = 0;
+                tp0 = curr;
+            }
         }
         i++;
     }
 
+    if (migrate_rdma_enabled() && rdma_write_flush(&rdma_mdata) < 0) {
+        qemu_file_set_error(f, -EIO);
+        return 0;
+    }
+
     qemu_mutex_unlock_ramlist();
 
     if (ret < 0) {
diff --git a/savevm.c b/savevm.c
index 304d1ef..4d0bef3 100644
--- a/savevm.c
+++ b/savevm.c
@@ -24,6 +24,7 @@ 
 
 #include "config-host.h"
 #include "qemu-common.h"
+#include "qemu/rdma.h"
 #include "hw/hw.h"
 #include "hw/qdev.h"
 #include "net/net.h"
@@ -417,7 +418,7 @@  int qemu_file_get_error(QEMUFile *f)
     return f->last_error;
 }
 
-static void qemu_file_set_error(QEMUFile *f, int ret)
+void qemu_file_set_error(QEMUFile *f, int ret)
 {
     if (f->last_error == 0) {
         f->last_error = ret;
@@ -1613,6 +1614,7 @@  int qemu_savevm_state_iterate(QEMUFile *f)
 {
     SaveStateEntry *se;
     int ret = 1;
+    static int first_time = 1;
 
     QTAILQ_FOREACH(se, &savevm_handlers, entry) {
         if (!se->ops || !se->ops->save_live_iterate) {
@@ -1643,6 +1645,30 @@  int qemu_savevm_state_iterate(QEMUFile *f)
         }
     }
     if (ret != 0) {
+#ifdef RDMA_EXTRA_SYNC
+        /*
+         * We use two "sync" infiniband messages happen during migration.
+         * One at the beginning and one at the end, just to be thorough.
+         * This is the first one.
+         */
+        if (first_time && migrate_rdma_enabled()) {
+            int r;
+            first_time = 0;
+            if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC)) {
+                fprintf(stderr,
+                        "rdma migration: error posting extra send sync!\n");
+                return -EIO;
+            }
+
+            r = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_EXTRA_SYNC);
+            if (r < 0) {
+                fprintf(stderr,
+                        "rdma migration: qemu_savevm_state_iterate"
+                        " sync polling error!\n");
+                return -EIO;
+            }
+        }
+#endif
         return ret;
     }
     ret = qemu_file_get_error(f);
@@ -1703,8 +1729,30 @@  int qemu_savevm_state_complete(QEMUFile *f)
         trace_savevm_section_end(se->section_id);
     }
 
+    /*
+     * We use two "sync" infiniband messages happen during migration.
+     * One at the beginning and one at the end, just to be thorough.
+     * This is the second one.
+     */
+    if (migrate_rdma_enabled()) {
+        if (rdma_post_send_sync(&rdma_mdata, RDMA_WRID_SEND_SYNC)) {
+            fprintf(stderr, "rdma migration: error posting send sync!\n");
+            return -EIO;
+        }
+    }
+
     qemu_put_byte(f, QEMU_VM_EOF);
 
+    /* wait for RDMA sync message to complete */
+    if (migrate_rdma_enabled()) {
+        int ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_SEND_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_savevm_state_full"
+                            " sync polling error!\n");
+            return -EIO;
+        }
+     }
+
     return qemu_file_get_error(f);
 }
 
@@ -2014,6 +2062,15 @@  int qemu_loadvm_state(QEMUFile *f)
 
     cpu_synchronize_all_post_init();
 
+    /* wait for RDMA sync message */
+    if (migrate_rdma_enabled()) {
+        ret = rdma_wait_for_wrid(&rdma_mdata, RDMA_WRID_RECV_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
+                            " sync polling error!\n");
+            goto out;
+        }
+    }
     ret = 0;
 
 out: