
[RFC RDMA support v1: 5/5] send memory over RDMA as blocks are iterated

Message ID 1359410505-4715-5-git-send-email-mrhines@linux.vnet.ibm.com
State New

Commit Message

mrhines@linux.vnet.ibm.com Jan. 28, 2013, 10:01 p.m. UTC
From: "Michael R. Hines" <mrhines@us.ibm.com>


Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
---
 arch_init.c                   |  116 +++++++++++++++++++++++++++++++++++++++--
 include/migration/qemu-file.h |    1 +
 savevm.c                      |   90 +++++++++++++++++++++++++++-----
 3 files changed, 189 insertions(+), 18 deletions(-)

Comments

Orit Wasserman Jan. 31, 2013, 11:04 a.m. UTC | #1
Hi Michael,
I may be missing something here, but why do you need a RAM_SAVE_FLAG_RDMA flag?
You don't do any decoding on the destination side.

I would suggest creating a QEMUFileRDMA and moving the write/read code there.
You can either add a new rdma_buffer QEMUFileOps callback or add the address to put_buffer.

You also have some whitespace damage at the beginning of savevm.c.

Regards,
Orit
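
A minimal sketch of the suggested QEMUFileRDMA, assuming the QEMUFileOps
interface of this period (RDMAContext and qemu_rdma_write() are hypothetical
stand-ins for the patch's rdma_mdata machinery):

/* Sketch only: a QEMUFileRDMA built on QEMUFileOps, as suggested above.
 * RDMAContext and qemu_rdma_write() are placeholders for the patch's
 * rdma_mdata machinery; QEMUFileOps/qemu_fopen_ops() match qemu-file.h
 * of this era. */
typedef struct RDMAContext RDMAContext;                      /* hypothetical */
int qemu_rdma_write(RDMAContext *rdma, const uint8_t *buf, int size);

typedef struct QEMUFileRDMA {
    RDMAContext *rdma;
} QEMUFileRDMA;

static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
                                int64_t pos, int size)
{
    QEMUFileRDMA *r = opaque;

    /* Hand the bytes to the RDMA layer instead of a socket; only
     * put_buffer is wired up here, the other ops can stay NULL. */
    if (qemu_rdma_write(r->rdma, buf, size) < 0) {
        return -EIO;
    }
    return size;
}

static const QEMUFileOps rdma_write_ops = {
    .put_buffer = qemu_rdma_put_buffer,
};

QEMUFile *qemu_fopen_rdma(RDMAContext *rdma)
{
    QEMUFileRDMA *r = g_malloc0(sizeof(*r));

    r->rdma = rdma;
    return qemu_fopen_ops(r, &rdma_write_ops);
}

Only the write side is sketched; a read-side get_buffer for the destination
would follow the same pattern.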

On 01/29/2013 12:01 AM, mrhines@linux.vnet.ibm.com wrote:
> From: "Michael R. Hines" <mrhines@us.ibm.com>
> 
> 
> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
> ---
>  arch_init.c                   |  116 +++++++++++++++++++++++++++++++++++++++--
>  include/migration/qemu-file.h |    1 +
>  savevm.c                      |   90 +++++++++++++++++++++++++++-----
>  3 files changed, 189 insertions(+), 18 deletions(-)
> 
> diff --git a/arch_init.c b/arch_init.c
> index dada6de..7633fa6 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -42,6 +42,7 @@
>  #include "migration/migration.h"
>  #include "exec/gdbstub.h"
>  #include "hw/smbios.h"
> +#include "qemu/rdma.h"
>  #include "exec/address-spaces.h"
>  #include "hw/pcspk.h"
>  #include "migration/page_cache.h"
> @@ -113,6 +114,7 @@ const uint32_t arch_type = QEMU_ARCH;
>  #define RAM_SAVE_FLAG_EOS      0x10
>  #define RAM_SAVE_FLAG_CONTINUE 0x20
>  #define RAM_SAVE_FLAG_XBZRLE   0x40
> +#define RAM_SAVE_FLAG_RDMA     0x80
>  
>  #ifdef __ALTIVEC__
>  #include <altivec.h>
> @@ -434,6 +436,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
>      int bytes_sent = 0;
>      MemoryRegion *mr;
>      ram_addr_t current_addr;
> +    static int not_sent = 1;
>  
>      if (!block)
>          block = QTAILQ_FIRST(&ram_list.blocks);
> @@ -457,23 +460,75 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
>              int cont = (block == last_sent_block) ?
>                  RAM_SAVE_FLAG_CONTINUE : 0;
>  
> +            current_addr = block->offset + offset;
>              p = memory_region_get_ram_ptr(mr) + offset;
>  
>              /* In doubt sent page as normal */
>              bytes_sent = -1;
> -            if (is_dup_page(p)) {
> +
> +            /*
> +             * RFC RDMA: The empirical cost of searching for zero pages here,
> +             *           plus the cost of communicating with the other side,
> +             *           seems to be significantly higher than that of simply
> +             *           dumping the page into remote memory.
> +             */
> +            if (!qemu_rdma_migration_enabled() && is_dup_page(p)) {
>                  acct_info.dup_pages++;
>                  bytes_sent = save_block_hdr(f, block, offset, cont,
>                                              RAM_SAVE_FLAG_COMPRESS);
>                  qemu_put_byte(f, *p);
>                  bytes_sent += 1;
> +            /*
> +             * RFC RDMA: Same comment as above. time(run-length encoding)
> +             *           + time(communication) is too big. RDMA throughput tanks
> +             *           when this feature is enabled. But there's no need
> +             *           to change the code since the feature is optional.
> +             */
>              } else if (migrate_use_xbzrle()) {
> -                current_addr = block->offset + offset;
>                  bytes_sent = save_xbzrle_page(f, p, current_addr, block,
>                                                offset, cont, last_stage);
>                  if (!last_stage) {
>                      p = get_cached_data(XBZRLE.cache, current_addr);
>                  }
> +            } else if (qemu_rdma_migration_enabled()) {
> +                int ret;
> +
> +                /*
> +                 * RFC RDMA: This bad hack was to cause the loop on the
> +                 *           receiving side to break. Comments are welcome
> +                 *           on how to get rid of it.
> +                 */
> +                if (not_sent == 1) {
> +                        not_sent = 0;
> +                        bytes_sent = save_block_hdr(f, block, offset,
> +                                                cont, RAM_SAVE_FLAG_RDMA);
> +                }
> +                acct_info.norm_pages++;
> +                /*
> +                 * use RDMA to send page
> +                 */
> +                if (qemu_rdma_migration_write(&rdma_mdata, current_addr,
> +                        TARGET_PAGE_SIZE)) {
> +                    fprintf(stderr, "rdma migration: write error!\n");
> +                    qemu_file_set_error(f, -EIO);
> +                    return 0;
> +                }
> +
> +                /*
> +                 * do some polling
> +                 */
> +                while (1) {
> +                    ret = qemu_rdma_migration_poll(&rdma_mdata);
> +                    if (ret == QEMU_RDMA_MIGRATION_WRID_NONE) {
> +                        break;
> +                    }
> +                    if (ret < 0) {
> +                        fprintf(stderr, "rdma migration: polling error!\n");
> +                        qemu_file_set_error(f, -EIO);
> +                        return 0;
> +                    }
> +                }
> +                bytes_sent += TARGET_PAGE_SIZE;
>              }
>  
>              /* XBZRLE overflow or normal page */
> @@ -601,12 +656,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>      return 0;
>  }
>  
> +
> +int tprate = 1000;
> +
>  static int ram_save_iterate(QEMUFile *f, void *opaque)
>  {
>      int ret;
>      int i;
> -    int64_t t0;
> -    int total_sent = 0;
> +    int64_t t0, tp0;
> +    int total_sent = 0, last_total_sent = 0;
>  
>      qemu_mutex_lock_ramlist();
>  
> @@ -625,23 +683,55 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
>              break;
>          }
>          total_sent += bytes_sent;
> +        last_total_sent += bytes_sent;
>          acct_info.iterations++;
>          /* we want to check in the 1st loop, just in case it was the 1st time
>             and we had to sync the dirty bitmap.
>             qemu_get_clock_ns() is a bit expensive, so we only check each some
>             iterations
>          */
> +
> +        /*
> +         * RFC RDMA: Can we have something like this to periodically print
> +         * out throughput?
> +         * This is just a rough sketch that partially worked for me.
> +         * I assume there is a better way that everyone would prefer.
> +         * Perhaps we could add a QMP command that toggles a "periodic printing"
> +         * option, allowing more details to be printed on stdout?
> +         */
>          if ((i & 63) == 0) {
> -            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
> +            uint64_t curr = qemu_get_clock_ns(rt_clock);
> +            uint64_t t1 = (curr - t0) / 1000000;
> +            double tp;
>              if (t1 > MAX_WAIT) {
>                  DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
>                          t1, i);
>                  break;
>              }
> +
> +            if ((i % tprate) == 0) {
> +                uint64_t tp1 = (curr - tp0) / 1000000;
> +                tp = ((double) last_total_sent * 8.0 /
> +                               ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
> +                printf("throughput: %f mbps\n", tp);
> +                last_total_sent = 0;
> +                tp0 = curr;
> +            }
>          }
>          i++;
>      }
>  
> +    /* flush buffer write */
> +    if (qemu_rdma_migration_enabled()) {
> +        int resp;
> +        resp = qemu_rdma_migration_write_flush(&rdma_mdata);
> +        if (resp < 0) {
> +            fprintf(stderr, "rdma migration: write flush error!\n");
> +            qemu_file_set_error(f, -EIO);
> +            return 0;
> +        }
> +    }
> +
>      qemu_mutex_unlock_ramlist();
>  
>      if (ret < 0) {
> @@ -863,6 +953,22 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
>                  ret = -EINVAL;
>                  goto done;
>              }
> +        } else if (flags & RAM_SAVE_FLAG_RDMA) {
> +            /*
> +             * RFC RDMA: This bad hack was to cause the loop to break.
> +             *           Communicating here is unnecessary because the
> +             *           RDMA page has already arrived.
> +             *           Comments are welcome on how to get rid of this.
> +             */
> +            if (!qemu_rdma_migration_enabled()) {
> +                return -EINVAL;
> +            }
> +            void *host = host_from_stream_offset(f, addr, flags);
> +            if (!host) {
> +                return -EINVAL;
> +            }
> +            /* rdma page is already here, nothing to do */
>          }
>          error = qemu_file_get_error(f);
>          if (error) {
> diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
> index 68deefb..7c9968e 100644
> --- a/include/migration/qemu-file.h
> +++ b/include/migration/qemu-file.h
> @@ -112,6 +112,7 @@ int qemu_file_rate_limit(QEMUFile *f);
>  int64_t qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
>  int64_t qemu_file_get_rate_limit(QEMUFile *f);
>  int qemu_file_get_error(QEMUFile *f);
> +void qemu_file_set_error(QEMUFile *f, int ret);
>  
>  static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
>  {
> diff --git a/savevm.c b/savevm.c
> index 304d1ef..071196e 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -24,6 +24,7 @@
>  
>  #include "config-host.h"
>  #include "qemu-common.h"
> +#include "qemu/rdma.h"
>  #include "hw/hw.h"
>  #include "hw/qdev.h"
>  #include "net/net.h"
> @@ -50,7 +51,7 @@
>  #define ARP_OP_REQUEST_REV 0x3
>  
>  static int announce_self_create(uint8_t *buf,
> -				uint8_t *mac_addr)
> +                                uint8_t *mac_addr)
>  {
>      /* Ethernet header. */
>      memset(buf, 0xff, 6);         /* destination MAC addr */
> @@ -97,16 +98,16 @@ static void qemu_announce_self_once(void *opaque)
>          qemu_mod_timer(timer, qemu_get_clock_ms(rt_clock) +
>                         50 + (SELF_ANNOUNCE_ROUNDS - count - 1) * 100);
>      } else {
> -	    qemu_del_timer(timer);
> -	    qemu_free_timer(timer);
> +            qemu_del_timer(timer);
> +            qemu_free_timer(timer);
>      }
>  }
>  
>  void qemu_announce_self(void)
>  {
> -	static QEMUTimer *timer;
> -	timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
> -	qemu_announce_self_once(&timer);
> +        static QEMUTimer *timer;
> +        timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
> +        qemu_announce_self_once(&timer);
>  }
>  
>  /***********************************************************/
> @@ -299,8 +300,8 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
>      QEMUFileStdio *s;
>  
>      if (mode == NULL ||
> -	(mode[0] != 'r' && mode[0] != 'w') ||
> -	mode[1] != 'b' || mode[2] != 0) {
> +        (mode[0] != 'r' && mode[0] != 'w') ||
> +        mode[1] != 'b' || mode[2] != 0) {
>          fprintf(stderr, "qemu_fdopen: Argument validity check failed\n");
>          return NULL;
>      }
> @@ -342,8 +343,8 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode)
>      QEMUFileStdio *s;
>  
>      if (mode == NULL ||
> -	(mode[0] != 'r' && mode[0] != 'w') ||
> -	mode[1] != 'b' || mode[2] != 0) {
> +        (mode[0] != 'r' && mode[0] != 'w') ||
> +        mode[1] != 'b' || mode[2] != 0) {
>          fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
>          return NULL;
>      }
> @@ -417,7 +418,7 @@ int qemu_file_get_error(QEMUFile *f)
>      return f->last_error;
>  }
>  
> -static void qemu_file_set_error(QEMUFile *f, int ret)
> +void qemu_file_set_error(QEMUFile *f, int ret)
>  {
>      if (f->last_error == 0) {
>          f->last_error = ret;
> @@ -1613,6 +1614,7 @@ int qemu_savevm_state_iterate(QEMUFile *f)
>  {
>      SaveStateEntry *se;
>      int ret = 1;
> +    static int first_time = 1;
>  
>      QTAILQ_FOREACH(se, &savevm_handlers, entry) {
>          if (!se->ops || !se->ops->save_live_iterate) {
> @@ -1643,8 +1645,36 @@ int qemu_savevm_state_iterate(QEMUFile *f)
>          }
>      }
>      if (ret != 0) {
> +#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC
> +        /*
> +         * We use two "sync" InfiniBand messages during migration:
> +         * one at the beginning and one at the end, just to be thorough.
> +         * This is the first one.
> +         */
> +        if (first_time && qemu_rdma_migration_enabled()) {
> +            int r;
> +            first_time = 0;
> +            if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
> +                        QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC)) {
> +                fprintf(stderr,
> +                        "rdma migration: error posting extra send sync!\n");
> +                return -EIO;
> +            }
> +
> +            r = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
> +                        QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC);
> +            if (r < 0) {
> +                fprintf(stderr,
> +                        "rdma migration: qemu_savevm_state_iterate"
> +                        " sync polling error!\n");
> +                return -EIO;
> +            }
> +        }
> +#endif
> +
>          return ret;
>      }
> +
>      ret = qemu_file_get_error(f);
>      if (ret != 0) {
>          qemu_savevm_state_cancel();
> @@ -1684,7 +1714,7 @@ int qemu_savevm_state_complete(QEMUFile *f)
>          int len;
>  
>          if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
> -	    continue;
> +            continue;
>          }
>          trace_savevm_section_start();
>          /* Section type */
> @@ -1703,8 +1733,32 @@ int qemu_savevm_state_complete(QEMUFile *f)
>          trace_savevm_section_end(se->section_id);
>      }
>  
> +    /*
> +     * We use two "sync" InfiniBand messages during migration:
> +     * one at the beginning and one at the end, just to be thorough.
> +     * This is the second one.
> +     */
> +    if (qemu_rdma_migration_enabled()) {
> +        if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
> +                    QEMU_RDMA_MIGRATION_WRID_SEND_SYNC)) {
> +            fprintf(stderr, "rdma migration: error posting send sync!\n");
> +            return -EIO;
> +        }
> +    }
> +
>      qemu_put_byte(f, QEMU_VM_EOF);
>  
> +    /* wait for RDMA sync message to complete */
> +    if (qemu_rdma_migration_enabled()) {
> +        int ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
> +                QEMU_RDMA_MIGRATION_WRID_SEND_SYNC);
> +        if (ret < 0) {
> +            fprintf(stderr, "rdma migration: qemu_savevm_state_full"
> +                            " sync polling error!\n");
> +            return -EIO;
> +        }
> +     }
> +
>      return qemu_file_get_error(f);
>  }
>  
> @@ -2014,8 +2068,18 @@ int qemu_loadvm_state(QEMUFile *f)
>  
>      cpu_synchronize_all_post_init();
>  
> -    ret = 0;
> +    /* wait for RDMA sync message */
> +    if (qemu_rdma_migration_enabled()) {
> +        ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
> +                QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
> +        if (ret < 0) {
> +            fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
> +                            " sync polling error!\n");
> +            goto out;
> +        }
> +    }
>  
> +    ret = 0;
>  out:
>      QLIST_FOREACH_SAFE(le, &loadvm_handlers, entry, new_le) {
>          QLIST_REMOVE(le, entry);
>
mrhines@linux.vnet.ibm.com Jan. 31, 2013, 3:08 p.m. UTC | #2
Yes, I was hoping for a comment about this in particular (before I send 
out another patchset with the proper cover letter and so forth).

So here's the problem I was experiencing inside savevm.c, 
qemu_loadvm_state():

I was having a little trouble serializing the client/server protocol in 
my brain.

When the server side begins loading pages into RAM, qemu_loadvm_state() 
goes into a tight loop waiting for each memory page to be transmitted, 
one by one.

Now, for RDMA, this loop is not necessary, but the loop was stuck 
waiting for TCP messages that did not need to be sent. So the extra 
flag you saw was a hack to break out of the loop.

.... but according to you, I should bypass this loop entirely?
Should I write a brand new function in savevm.c that skips it?
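
For reference, the receive loop in question looks roughly like this (a
paraphrase of ram_load() in arch_init.c of this era, not the exact code):

ram_addr_t addr;
int flags;

do {
    /* Each iteration blocks here reading a page header from the stream;
     * a pure-RDMA sender writes nothing to the stream, so the destination
     * waits forever unless something (RAM_SAVE_FLAG_EOS, or the patch's
     * RAM_SAVE_FLAG_RDMA hack) actually arrives on the wire. */
    addr = qemu_get_be64(f);
    flags = addr & ~TARGET_PAGE_MASK;
    addr &= TARGET_PAGE_MASK;

    if (flags & RAM_SAVE_FLAG_COMPRESS) {
        /* one fill byte follows; memset the whole page */
    } else if (flags & RAM_SAVE_FLAG_PAGE) {
        /* a full page of data follows on the stream */
    }
    /* ... XBZRLE and error handling elided ... */
} while (!(flags & RAM_SAVE_FLAG_EOS));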

Also, I have a question about QEMUFileRDMA: since RDMA does not 
require a file-like abstraction of any kind (meaning there is no 
explicit handshaking during an RDMA transfer), should I really create 
one? Unlike the sockets and snapshot files that a typical migration 
would normally need, an RDMA-based migration no longer operates this 
way.

Thanks for all the comments ....... keep them coming =)

- Michael


On 01/31/2013 06:04 AM, Orit Wasserman wrote:
> [quoted message and patch snipped]
Orit Wasserman Jan. 31, 2013, 6:56 p.m. UTC | #3
On 01/31/2013 05:08 PM, Michael R. Hines wrote:
> Yes, I was hoping for a comment about this in particular (before I send out another patchset with the proper cover letter and so forth).
> 
> So here's the problem I was experiencing inside savevm.c, qemu_loadvm_state():
> 
> I was having a little trouble serializing the client/server protocol in my brain.
> 
> When the server side begins loading pages into RAM, qemu_loadvm_state() goes into a tight loop waiting for each memory page to be transmitted, one by one.
> 
> Now, for RDMA, this loop is not necessary, but the loop was stuck waiting for TCP messages that did not need to be sent. So the extra flag you saw was a hack to break out of the loop.
> 
> .... but according to you, I should bypass this loop entirely?
> Should I write a brand new function in savevm.c that skips it?
No, the pages are not sent one by one; they are actually buffered (qemu_put_buffer writes them into a buffer).
This is done to ensure migration won't exceed its speed limit, i.e. bandwidth capping.
You will also need this for RDMA, as RDMA bandwidth is shared with guests, other migration processes, and the host.

You should not bypass the loop, as you need to mark transferred pages as clean in the bitmap.
To exit the loop in ram_save_block, just set bytes_sent to the page size, which is what you are sending on the wire.
That value is also used to calculate the amount of data sent during migration.
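
Concretely, a sketch of that suggestion against the patch's ram_save_block()
branch (the helper names are the patch's own; the point is that no
RAM_SAVE_FLAG_RDMA header is needed once bytes_sent reports the page size):

            } else if (qemu_rdma_migration_enabled()) {
                /* The page itself travels via RDMA, not the QEMUFile
                 * stream, so no header is written here. */
                if (qemu_rdma_migration_write(&rdma_mdata, current_addr,
                                              TARGET_PAGE_SIZE)) {
                    qemu_file_set_error(f, -EIO);
                    return 0;
                }
                acct_info.norm_pages++;
                /* Report the bytes actually moved so the caller's
                 * accounting and loop-exit logic work unchanged. */
                bytes_sent = TARGET_PAGE_SIZE;
            }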
>
> Also, with the QEMUFileRDMA, I have a question: Since RDMA does not require a file-like abstraction of any kind (meaning there is no explicit handshaking during an RDMA transfer), should I really create one of these? Unlike sockets and snapshot files that a typical migration would normally need, an RDMA-based migration doesn't operate this way anymore.
Not sure I understand your question, but you don't have to implement all the ops.

Cheers,
Orit
> [remainder of quoted message and patch snipped]
mrhines@linux.vnet.ibm.com Jan. 31, 2013, 7:10 p.m. UTC | #4
Sorry, I didn't go into enough detail about the problem I'm having in 
the loop:

The loop that's not breaking is inside qemu_loadvm_state(), not 
ram_save_block().

This do-while loop is not exiting. Is it necessary for me to 
maintain this loop for RDMA purposes, given that explicit user-level 
synchronization is needed to transmit an RDMA page?

Additionally: if I have to write each page into a new QEMUFileRDMA 
abstraction for bandwidth accounting purposes, this would 
*significantly* reduce the performance advantages of RDMA.

Is there a way to do the accounting without doing any additional memory 
copies?

(If I'm understanding the abstraction properly)....
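
One copy-free possibility, as a sketch (it assumes access to QEMUFile's
internal position counter; QEMU later added a qemu_update_position()
helper along these lines):

/* savevm.c sketch: credit out-of-band RDMA bytes to the QEMUFile's
 * position counter so rate limiting and transfer statistics see them,
 * without copying page data through the file buffer. */
void qemu_update_position(QEMUFile *f, size_t size)
{
    f->pos += size;
}

/* In ram_save_block(), after a successful RDMA write: */
qemu_update_position(f, TARGET_PAGE_SIZE);
bytes_sent = TARGET_PAGE_SIZE;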

- Michael

On 01/31/2013 01:56 PM, Orit Wasserman wrote:
> On 01/31/2013 05:08 PM, Michael R. hines wrote:
>> Yes, I was hoping for a comment about this in particular (before I send out another patchest with the proper coverletter and so forth).
>>
>> So here's the problem I was experiencing inside savevm.c, qemu_loadvm_state():
>>
>> I was having a little trouble serializing the client/server protocol in my brain.
>>
>> When the server-side begins loading pages into ram, qemu_loadvm_state() goes into a tight loop waiting for each memory page to be transmitted, one-by-one.
>>
>> Now, for RDMA, this loop is not necessary, but the loop was stuck waiting for TCP messages that did not need to be sent. So, the extra flag you saw was a hack to break out of the loop
>>
>> .... but according to you, I should bypass this loop entirely?
>> Should I write a brand new function in savevm.c which skips this function?
> no the pages are not sent one by one but actually are buffered (qemu_put_buffer writes them into a buffer).
> This is done to ensure migration won't exceed it speed limit - i.e bandwidth capping.
> You will need it also for RDMA, as the bandwidth of the RDMA is shared with guests, other migration processes and the host.
>
> You should not bypass the loop as you need to mark pages transferred as clean in the bitmap,
> in order to exit the loop in ram_save_block just set bytes_sent to the page size which is what you are sending on the wire.
> It is also uesd to calculated the amount of data sent during migration.
>> Also, with the QEMUFileRDMA, I have a question: Since RDMA does not require a file-like abstraction of any kind (meaning there is no explicit handshaking during an RDMA transfer), should I really create one of these? Unlike sockets and snapshot files that a typical migration would normally need, an RDMA-based migration doesn't operate this way anymore.
> Not sure I understand you question but you don't have to implement all the ops.
>
> Cheers,
> Orit
>> Thanks for all the comments ....... keep them coming =)
>>
>> - Michael
>>
>>
>> On 01/31/2013 06:04 AM, Orit Wasserman wrote:
>>> [Orit's review comments and the full quoted patch snipped]
Orit Wasserman Feb. 1, 2013, 10:01 a.m. UTC | #5
On 01/31/2013 09:10 PM, Michael R. hines wrote:
> Sorry, I didn't go into enough detail about the problem I'm having in the loop:
> 
> The loop that's not breaking is inside qemu_loadvm_state(), not ram_save_block().
I understand now; ram_load always reads the addr at the beginning of the while loop.
You will need to change it to skip that read when RDMA is enabled; then your code will work.
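
Roughly like this, inside ram_load()'s loop (just a sketch against your helpers, untested):

    do {
        if (qemu_rdma_migration_enabled()) {
            /*
             * RAM pages arrive via RDMA writes, not through the stream,
             * so don't block in qemu_get_be64() waiting for per-page
             * headers -- wait on the RDMA sync message instead.
             */
            break;
        }
        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;
        /* ... existing flag handling ... */
    } while (!(flags & RAM_SAVE_FLAG_EOS));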

You reminded me that RDMA requires both sides to support it; for that kind of thing
we introduced migration capabilities, so that the management layer (libvirt) or the user can verify
that both the source and destination support it. See migrate_set_capability and info
migrate_capabilities in the monitor, used for XBZRLE (compression).
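
For example, this is how it looks today for xbzrle (an rdma capability would be a new, hypothetical name handled the same way on both sides; output abbreviated):

    (qemu) migrate_set_capability xbzrle on
    (qemu) info migrate_capabilities
    capabilities: xbzrle: on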

Cheers,
Orit
> 
> This do-while loop is not exiting... is it necessary for me to maintain this loop for RDMA, given that explicit user-level synchronization is needed to transmit an RDMA page?
> 
> Additionally: if I have to copy each page into a new QEMUFileRDMA abstraction for bandwidth-accounting purposes, that would *significantly* erode the performance advantage of RDMA.
> 
> Is there a way to do the accounting without any additional memory copies?
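
Purely as a sketch of one direction (nothing like this exists in the tree today; the helper name and the struct internals are hypothetical): the RDMA write path could credit the QEMUFile's byte counter directly, so rate limiting and statistics see the bytes without the page data ever passing through the buffer:

    /* Hypothetical: account for bytes that bypassed the QEMUFile buffer. */
    static void qemu_file_credit_transfer(QEMUFile *f, int64_t size)
    {
        /* bump whatever counter the rate-limit / stats code reads */
        f->pos += size;
    }

    /* in the RDMA send path: */
    if (qemu_rdma_migration_write(&rdma_mdata, current_addr,
                                  TARGET_PAGE_SIZE) == 0) {
        qemu_file_credit_transfer(f, TARGET_PAGE_SIZE);
    }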
> 
> (If I'm understanding the abstraction properly)....
> 
> - Michael
> 
> On 01/31/2013 01:56 PM, Orit Wasserman wrote:
>> On 01/31/2013 05:08 PM, Michael R. hines wrote:
>>> Yes, I was hoping for a comment about this in particular (before I send out another patchset with the proper cover letter and so forth).
>>>
>>> [remainder of the quoted discussion and the full original patch snipped]
>
mrhines@linux.vnet.ibm.com Feb. 1, 2013, 4:55 p.m. UTC | #6
Excellent, thank you.

On 02/01/2013 05:01 AM, Orit Wasserman wrote:
> [full quoted thread snipped]
diff mbox

Patch

diff --git a/arch_init.c b/arch_init.c
index dada6de..7633fa6 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -42,6 +42,7 @@ 
 #include "migration/migration.h"
 #include "exec/gdbstub.h"
 #include "hw/smbios.h"
+#include "qemu/rdma.h"
 #include "exec/address-spaces.h"
 #include "hw/pcspk.h"
 #include "migration/page_cache.h"
@@ -113,6 +114,7 @@  const uint32_t arch_type = QEMU_ARCH;
 #define RAM_SAVE_FLAG_EOS      0x10
 #define RAM_SAVE_FLAG_CONTINUE 0x20
 #define RAM_SAVE_FLAG_XBZRLE   0x40
+#define RAM_SAVE_FLAG_RDMA     0x80
 
 #ifdef __ALTIVEC__
 #include <altivec.h>
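For context on how the new flag bit travels on the wire: save_block_hdr() writes the page offset and the flags in a single be64, relying on the offset being TARGET_PAGE_SIZE-aligned so the low bits are free. A simplified sketch of that existing convention (names as in arch_init.c; the receive side recovers the bits with flags = addr & ~TARGET_PAGE_MASK):

    /*
     * Sketch of the existing save_block_hdr() convention (simplified
     * from arch_init.c): the page offset is TARGET_PAGE_SIZE-aligned,
     * so its low bits can carry flag bits such as RAM_SAVE_FLAG_RDMA
     * in the same be64 as the offset itself.
     */
    static int save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                              int cont, int flag)
    {
        qemu_put_be64(f, offset | cont | flag);    /* offset + flag bits */
        if (!cont) {
            /* first page of this block: also send the block id string */
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            return strlen(block->idstr) + 9;
        }
        return 8;                                  /* bytes written */
    }

This is why a new flag only costs one spare bit, but it is also why the destination has to recognize every bit it may receive, which is the concern raised in the review above.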
@@ -434,6 +436,7 @@  static int ram_save_block(QEMUFile *f, bool last_stage)
     int bytes_sent = 0;
     MemoryRegion *mr;
     ram_addr_t current_addr;
+    /* RFC hack: emit the RAM_SAVE_FLAG_RDMA header only once (see below) */
+    static int not_sent = 1;
 
     if (!block)
         block = QTAILQ_FIRST(&ram_list.blocks);
@@ -457,23 +460,75 @@  static int ram_save_block(QEMUFile *f, bool last_stage)
             int cont = (block == last_sent_block) ?
                 RAM_SAVE_FLAG_CONTINUE : 0;
 
+            current_addr = block->offset + offset;
             p = memory_region_get_ram_ptr(mr) + offset;
 
             /* In doubt sent page as normal */
             bytes_sent = -1;
-            if (is_dup_page(p)) {
+
+            /*
+             * RFC RDMA: Empirically, the cost of scanning for zero pages
+             *           here, plus the cost of telling the other side
+             *           about them, is significantly higher than simply
+             *           writing the page into remote memory.
+             */
+            if (!qemu_rdma_migration_enabled() && is_dup_page(p)) {
                 acct_info.dup_pages++;
                 bytes_sent = save_block_hdr(f, block, offset, cont,
                                             RAM_SAVE_FLAG_COMPRESS);
                 qemu_put_byte(f, *p);
                 bytes_sent += 1;
+            /*
+             * RFC RDMA: Same reasoning as above: time(run-length encoding)
+             *           + time(communication) is too high, and RDMA
+             *           throughput tanks when this feature is enabled.
+             *           No code change is needed, though, since the
+             *           feature is optional.
+             */
             } else if (migrate_use_xbzrle()) {
-                current_addr = block->offset + offset;
                 bytes_sent = save_xbzrle_page(f, p, current_addr, block,
                                               offset, cont, last_stage);
                 if (!last_stage) {
                     p = get_cached_data(XBZRLE.cache, current_addr);
                 }
+            } else if (qemu_rdma_migration_enabled()) {
+                int ret;
+
+                /*
+                 * RFC RDMA: This bad hack exists only to make the loop
+                 *           on the receiving side break out. Comments
+                 *           are welcome on how to get rid of it.
+                 */
+                if (not_sent == 1) {
+                    not_sent = 0;
+                    bytes_sent = save_block_hdr(f, block, offset,
+                                                cont, RAM_SAVE_FLAG_RDMA);
+                } else {
+                    /* no header this round; count only the page below */
+                    bytes_sent = 0;
+                }
+                acct_info.norm_pages++;
+                /*
+                 * use RDMA to send page
+                 */
+                if (qemu_rdma_migration_write(&rdma_mdata, current_addr,
+                        TARGET_PAGE_SIZE)) {
+                    fprintf(stderr, "rdma migration: write error!\n");
+                    qemu_file_set_error(f, -EIO);
+                    return 0;
+                }
+
+                /*
+                 * drain any completions that have accumulated on the
+                 * completion queue
+                 */
+                while (1) {
+                    ret = qemu_rdma_migration_poll(&rdma_mdata);
+                    if (ret == QEMU_RDMA_MIGRATION_WRID_NONE) {
+                        break;
+                    }
+                    if (ret < 0) {
+                        fprintf(stderr, "rdma migration: polling error!\n");
+                        qemu_file_set_error(f, -EIO);
+                        return 0;
+                    }
+                }
+                bytes_sent += TARGET_PAGE_SIZE;
             }
 
             /* XBZRLE overflow or normal page */
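To make the cost argument in the hunk above concrete: is_dup_page() scans until the first mismatching word, so a page that really is a duplicate costs a full TARGET_PAGE_SIZE read before anything is sent, while the RDMA write itself already runs close to memory speed. A scalar sketch of what the (vectorized) helper does, assuming a 4K page:

    #include <stdint.h>

    #ifndef TARGET_PAGE_SIZE
    #define TARGET_PAGE_SIZE 4096      /* assumption for this sketch */
    #endif

    /*
     * Scalar sketch of is_dup_page(): nonzero iff every byte of the
     * page equals its first byte. The real helper does the same
     * comparison with ALTIVEC/SSE vectors, but the asymptotics are
     * identical.
     */
    static int is_dup_page_sketch(const uint8_t *page)
    {
        const uint64_t *p = (const uint64_t *) page;
        uint64_t splat = page[0] * 0x0101010101010101ULL;  /* splat byte */
        int i;

        for (i = 0; i < TARGET_PAGE_SIZE / 8; i++) {
            if (p[i] != splat) {
                return 0;   /* early out on the first mismatch */
            }
        }
        return 1;           /* duplicate: we touched the whole page */
    }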
@@ -601,12 +656,15 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
     return 0;
 }
 
+
+/* RFC: number of iterations between throughput samples (rough sketch) */
+int tprate = 1000;
+
 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
     int ret;
     int i;
-    int64_t t0;
-    int total_sent = 0;
+    int64_t t0, tp0 = qemu_get_clock_ns(rt_clock); /* tp0: sample anchor */
+    int total_sent = 0, last_total_sent = 0;
 
     qemu_mutex_lock_ramlist();
 
@@ -625,23 +683,55 @@  static int ram_save_iterate(QEMUFile *f, void *opaque)
             break;
         }
         total_sent += bytes_sent;
+        last_total_sent += bytes_sent;
         acct_info.iterations++;
         /* we want to check in the 1st loop, just in case it was the 1st time
            and we had to sync the dirty bitmap.
            qemu_get_clock_ns() is a bit expensive, so we only check each some
            iterations
         */
+
+        /*
+         * RFC RDMA: Can we have something like this to periodically
+         * print out throughput? This is just a rough sketch that
+         * partially worked for me; I assume there is a better way that
+         * everyone would prefer. Perhaps a QMP command could toggle a
+         * "periodic printing" option that allows more details to be
+         * printed on stdout?
+         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
+            uint64_t curr = qemu_get_clock_ns(rt_clock);
+            uint64_t t1 = (curr - t0) / 1000000;
+            double tp;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
                 break;
             }
+
+            /* skip i == 0: the sample window has only just opened */
+            if (i > 0 && (i % tprate) == 0) {
+                uint64_t tp1 = (curr - tp0) / 1000000;
+                /* bytes since the last sample -> megabits per second */
+                tp = ((double) last_total_sent * 8.0 /
+                               ((double) tp1 / 1000.0)) / 1000.0 / 1000.0;
+                printf("throughput: %f mbps\n", tp);
+                last_total_sent = 0;
+                tp0 = curr;
+            }
         }
         i++;
     }
 
+    /* flush buffer write */
+    if (qemu_rdma_migration_enabled()) {
+        int resp;
+        resp = qemu_rdma_migration_write_flush(&rdma_mdata);
+        if (resp < 0) {
+            fprintf(stderr, "rdma migration: write flush error!\n");
+            qemu_file_set_error(f, -EIO);
+            return 0;
+        }
+    }
+
     qemu_mutex_unlock_ramlist();
 
     if (ret < 0) {
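As a sanity check on the throughput expression above: last_total_sent counts bytes accumulated since the previous sample and tp1 is the window length in milliseconds, so the chain is bytes -> bits (*8), milliseconds -> seconds (/1000), then /1e6 to reach mega. A self-contained equivalent with one worked value:

    #include <stdint.h>
    #include <stdio.h>

    /* Equivalent of the inline expression: 'bytes' moved during a
     * window of 'ms' milliseconds, reported in megabits per second. */
    static double window_mbps(uint64_t bytes, uint64_t ms)
    {
        return ((double) bytes * 8.0 / ((double) ms / 1000.0)) / 1000.0 / 1000.0;
    }

    int main(void)
    {
        /* 64 MiB in a 500 ms window -> about 1073.7 mbps */
        printf("throughput: %f mbps\n", window_mbps(64ULL << 20, 500));
        return 0;
    }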
@@ -863,6 +953,22 @@  static int ram_load(QEMUFile *f, void *opaque, int version_id)
                 ret = -EINVAL;
                 goto done;
             }
+        } else if (flags & RAM_SAVE_FLAG_RDMA) {
+            /*
+             * RFC RDMA: This bad hack exists only to break the receive
+             *           loop; nothing needs to be read here because the
+             *           RDMA page has already arrived. Comments are
+             *           welcome on how to get rid of it.
+             */
+            if (!qemu_rdma_migration_enabled()) {
+                return -EINVAL;
+            }
+            void *host = host_from_stream_offset(f, addr, flags);
+            if (!host) {
+                return -EINVAL;
+            }
+            /* rdma page is already here, nothing to do */
         }
         error = qemu_file_get_error(f);
         if (error) {
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 68deefb..7c9968e 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -112,6 +112,7 @@  int qemu_file_rate_limit(QEMUFile *f);
 int64_t qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
 int64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error(QEMUFile *f);
+void qemu_file_set_error(QEMUFile *f, int ret);
 
 static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
 {
diff --git a/savevm.c b/savevm.c
index 304d1ef..071196e 100644
--- a/savevm.c
+++ b/savevm.c
@@ -24,6 +24,7 @@ 
 
 #include "config-host.h"
 #include "qemu-common.h"
+#include "qemu/rdma.h"
 #include "hw/hw.h"
 #include "hw/qdev.h"
 #include "net/net.h"
@@ -50,7 +51,7 @@ 
 #define ARP_OP_REQUEST_REV 0x3
 
 static int announce_self_create(uint8_t *buf,
-				uint8_t *mac_addr)
+                                uint8_t *mac_addr)
 {
     /* Ethernet header. */
     memset(buf, 0xff, 6);         /* destination MAC addr */
@@ -97,16 +98,16 @@  static void qemu_announce_self_once(void *opaque)
         qemu_mod_timer(timer, qemu_get_clock_ms(rt_clock) +
                        50 + (SELF_ANNOUNCE_ROUNDS - count - 1) * 100);
     } else {
-	    qemu_del_timer(timer);
-	    qemu_free_timer(timer);
+        qemu_del_timer(timer);
+        qemu_free_timer(timer);
     }
 }
 
 void qemu_announce_self(void)
 {
-	static QEMUTimer *timer;
-	timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
-	qemu_announce_self_once(&timer);
+    static QEMUTimer *timer;
+    timer = qemu_new_timer_ms(rt_clock, qemu_announce_self_once, &timer);
+    qemu_announce_self_once(&timer);
 }
 
 /***********************************************************/
@@ -299,8 +300,8 @@  QEMUFile *qemu_fdopen(int fd, const char *mode)
     QEMUFileStdio *s;
 
     if (mode == NULL ||
-	(mode[0] != 'r' && mode[0] != 'w') ||
-	mode[1] != 'b' || mode[2] != 0) {
+        (mode[0] != 'r' && mode[0] != 'w') ||
+        mode[1] != 'b' || mode[2] != 0) {
         fprintf(stderr, "qemu_fdopen: Argument validity check failed\n");
         return NULL;
     }
@@ -342,8 +343,8 @@  QEMUFile *qemu_fopen(const char *filename, const char *mode)
     QEMUFileStdio *s;
 
     if (mode == NULL ||
-	(mode[0] != 'r' && mode[0] != 'w') ||
-	mode[1] != 'b' || mode[2] != 0) {
+        (mode[0] != 'r' && mode[0] != 'w') ||
+        mode[1] != 'b' || mode[2] != 0) {
         fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
         return NULL;
     }
@@ -417,7 +418,7 @@  int qemu_file_get_error(QEMUFile *f)
     return f->last_error;
 }
 
-static void qemu_file_set_error(QEMUFile *f, int ret)
+void qemu_file_set_error(QEMUFile *f, int ret)
 {
     if (f->last_error == 0) {
         f->last_error = ret;
@@ -1613,6 +1614,7 @@  int qemu_savevm_state_iterate(QEMUFile *f)
 {
     SaveStateEntry *se;
     int ret = 1;
+    /* RFC hack: post the extra RDMA sync only on the first iteration */
+    static int first_time = 1;
 
     QTAILQ_FOREACH(se, &savevm_handlers, entry) {
         if (!se->ops || !se->ops->save_live_iterate) {
@@ -1643,8 +1645,36 @@  int qemu_savevm_state_iterate(QEMUFile *f)
         }
     }
     if (ret != 0) {
+#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC
+        /*
+         * We send two "sync" InfiniBand messages during migration:
+         * one at the beginning and one at the end, just to be thorough.
+         * This is the first one.
+         */
+        if (first_time && qemu_rdma_migration_enabled()) {
+            int r;
+            first_time = 0;
+            if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
+                        QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC)) {
+                fprintf(stderr,
+                        "rdma migration: error posting extra send sync!\n");
+                return -EIO;
+            }
+
+            r = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+                        QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC);
+            if (r < 0) {
+                fprintf(stderr,
+                        "rdma migration: qemu_savevm_state_iterate"
+                        " sync polling error!\n");
+                return -EIO;
+            }
+        }
+#endif
+
         return ret;
     }
+
     ret = qemu_file_get_error(f);
     if (ret != 0) {
         qemu_savevm_state_cancel();
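One hazard worth flagging in the hunk above: first_time is a function-local static and is never reset, so a second migration attempt in the same QEMU process would silently skip the extra sync. A sketch of one alternative that keeps the one-shot state in a resettable structure (all names here are hypothetical, not part of this patch):

    #include <stdbool.h>

    /* Hypothetical per-migration one-shot state (not in this patch). */
    typedef struct RDMASyncState {
        bool extra_sync_sent;
    } RDMASyncState;

    static RDMASyncState rdma_sync_state;

    /* Call from migration setup so each migration gets its own sync. */
    static void rdma_sync_state_reset(void)
    {
        rdma_sync_state.extra_sync_sent = false;
    }

    /* Would replace the 'first_time' test in qemu_savevm_state_iterate(). */
    static bool rdma_need_extra_sync(void)
    {
        if (!rdma_sync_state.extra_sync_sent) {
            rdma_sync_state.extra_sync_sent = true;
            return true;
        }
        return false;
    }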
@@ -1684,7 +1714,7 @@  int qemu_savevm_state_complete(QEMUFile *f)
         int len;
 
         if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
-	    continue;
+            continue;
         }
         trace_savevm_section_start();
         /* Section type */
@@ -1703,8 +1733,32 @@  int qemu_savevm_state_complete(QEMUFile *f)
         trace_savevm_section_end(se->section_id);
     }
 
+    /*
+     * We send two "sync" InfiniBand messages during migration:
+     * one at the beginning and one at the end, just to be thorough.
+     * This is the second one.
+     */
+    if (qemu_rdma_migration_enabled()) {
+        if (qemu_rdma_migration_post_send_sync(&rdma_mdata,
+                    QEMU_RDMA_MIGRATION_WRID_SEND_SYNC)) {
+            fprintf(stderr, "rdma migration: error posting send sync!\n");
+            return -EIO;
+        }
+    }
+
     qemu_put_byte(f, QEMU_VM_EOF);
 
+    /* wait for RDMA sync message to complete */
+    if (qemu_rdma_migration_enabled()) {
+        int ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+                QEMU_RDMA_MIGRATION_WRID_SEND_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_savevm_state_full"
+                            " sync polling error!\n");
+            return -EIO;
+        }
+    }
+
     return qemu_file_get_error(f);
 }
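The post-a-sync/wait-for-its-wrid pair now appears several times across savevm.c. Where the two halves are not deliberately split around other I/O (as they are around QEMU_VM_EOF in the hunk above), a small wrapper over this patch's own primitives would cut the duplication. A sketch (the wrapper name is hypothetical; the two calls inside it are the ones the patch already uses):

    /*
     * Hypothetical wrapper around this patch's primitives: post one
     * sync send, then block until that work request id completes.
     * Returns 0 on success, -EIO on any failure.
     */
    static int qemu_rdma_migration_send_sync_blocking(int wrid)
    {
        if (qemu_rdma_migration_post_send_sync(&rdma_mdata, wrid)) {
            fprintf(stderr, "rdma migration: error posting sync %d!\n", wrid);
            return -EIO;
        }
        if (qemu_rdma_migration_wait_for_wrid(&rdma_mdata, wrid) < 0) {
            fprintf(stderr, "rdma migration: sync %d polling error!\n", wrid);
            return -EIO;
        }
        return 0;
    }

The extra-sync block in qemu_savevm_state_iterate() would then collapse to a single call.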
 
@@ -2014,8 +2068,18 @@  int qemu_loadvm_state(QEMUFile *f)
 
     cpu_synchronize_all_post_init();
 
-    ret = 0;
+    /* wait for RDMA sync message */
+    if (qemu_rdma_migration_enabled()) {
+        ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata,
+                QEMU_RDMA_MIGRATION_WRID_RECV_SYNC);
+        if (ret < 0) {
+            fprintf(stderr, "rdma migration: qemu_loadvm_state_no_header"
+                            " sync polling error!\n");
+            goto out;
+        }
+    }
 
+    ret = 0;
 out:
     QLIST_FOREACH_SAFE(le, &loadvm_handlers, entry, new_le) {
         QLIST_REMOVE(le, entry);