Patchwork [RFC,RDMA,support,v2:,5/6] connection-setup code between client/server

login
register
mail settings
Submitter mrhines@linux.vnet.ibm.com
Date Feb. 11, 2013, 10:49 p.m.
Message ID <1360622997-26904-5-git-send-email-mrhines@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/219692/
State New
Headers show

Comments

mrhines@linux.vnet.ibm.com - Feb. 11, 2013, 10:49 p.m.
From: "Michael R. Hines" <mrhines@us.ibm.com>


Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
---
 migration-tcp.c |   19 +++++++++++++++++++
 migration.c     |   53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 71 insertions(+), 1 deletion(-)
Orit Wasserman - Feb. 13, 2013, 8:46 a.m.
On 02/12/2013 12:49 AM, Michael R. Hines wrote:
> From: "Michael R. Hines" <mrhines@us.ibm.com>
> 
> 
> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
> ---
>  migration-tcp.c |   19 +++++++++++++++++++
>  migration.c     |   53 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 71 insertions(+), 1 deletion(-)
> 
> diff --git a/migration-tcp.c b/migration-tcp.c
> index e78a296..113576e 100644
> --- a/migration-tcp.c
> +++ b/migration-tcp.c
> @@ -14,6 +14,7 @@
>   */
>  
>  #include "qemu-common.h"
> +#include "qemu/rdma.h"
>  #include "qemu/sockets.h"
>  #include "migration/migration.h"
>  #include "migration/qemu-file.h"
> @@ -55,6 +56,9 @@ static void tcp_wait_for_connect(int fd, void *opaque)
>  
>      if (fd < 0) {
>          DPRINTF("migrate connect error\n");
> +        if (migration_use_rdma()) {
> +            rdma_cleanup(&rdma_mdata);
> +        }

Those should be moved to migration-rdma.c
Are you still using the tcp for transferring device state?
If so you can call the tcp functions from the migration rdma code as a first step
but I would prefer it to use RDMA too.

>          s->fd = -1;
>          migrate_fd_error(s);
>      } else {
> @@ -62,6 +66,10 @@ static void tcp_wait_for_connect(int fd, void *opaque)
>          s->fd = fd;
>          socket_set_block(s->fd);
>          migrate_fd_connect(s);
> +
> +        if (migration_use_rdma() && rdma_wait_for_connect(fd, opaque)) {
> +            migrate_fd_error(s);
> +        }
See above
>      }
>  }
>  
> @@ -101,6 +109,12 @@ static void tcp_accept_incoming_migration(void *opaque)
>          goto out;
>      }
>  
> +    if (migration_use_rdma() && 
> +        rdma_accept_incoming_migration(&rdma_mdata)) {
> +        close(s);
> +        return;
> +    }
> +
See above
>      process_incoming_migration(f);
>      return;
>  
> @@ -117,6 +131,11 @@ void tcp_start_incoming_migration(const char *host_port, Error **errp)
>          return;
>      }
>  
> +    if (migration_use_rdma() && rdma_start_incoming_migration(s)) {
> +        close(s);
> +        return;
> +    }
> +
See above
>      qemu_set_fd_handler2(s, NULL, tcp_accept_incoming_migration, NULL,
>                           (void *)(intptr_t)s);
>  }
> diff --git a/migration.c b/migration.c
> index 77c1971..5ef923d 100644
> --- a/migration.c
> +++ b/migration.c
> @@ -22,6 +22,7 @@
>  #include "qemu/sockets.h"
>  #include "migration/block.h"
>  #include "qemu/thread.h"
> +#include "qemu/rdma.h"
>  #include "qmp-commands.h"
>  
>  //#define DEBUG_MIGRATION
> @@ -262,6 +263,16 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
>      }
>  
>      for (cap = params; cap; cap = cap->next) {
> +        if (cap->value->capability == MIGRATION_CAPABILITY_RDMA) {
> +#ifndef CONFIG_RDMA
> +            error_set(errp, QERR_MIGRATION_RDMA_NOT_ENABLED);
> +            continue;
> +#endif
> +            if (!migration_use_rdma()) {
> +                error_set(errp, QERR_MIGRATION_RDMA_NOT_CONFIGURED, rdma_mdata.host, rdma_mdata.port);
> +                continue;
> +            }
> +        }

I would let the user enable the RDMA capability first, then set the RDMA host/port.

>          s->enabled_capabilities[cap->value->capability] = cap->value->state;
>      }
>  }
> @@ -279,6 +290,11 @@ static int migrate_fd_cleanup(MigrationState *s)
>      }
>  
>      assert(s->fd == -1);
> +
> +    if (migrate_rdma_enabled()) {
> +        rdma_cleanup(&rdma_mdata);
> +    }
> +
>      return ret;
>  }
>  
> @@ -386,6 +402,9 @@ static MigrationState *migrate_init(const MigrationParams *params)
>      s->params = *params;
>      memcpy(s->enabled_capabilities, enabled_capabilities,
>             sizeof(enabled_capabilities));
> +
> +    rdma_update_capability(s);
Why? the user/management needs to enable the capability manually.
The fact that you can set the capability means this QEMU version supports it.
> +
>      s->xbzrle_cache_size = xbzrle_cache_size;
>  
>      s->bandwidth_limit = bandwidth_limit;
> @@ -481,6 +500,29 @@ int64_t qmp_query_migrate_cache_size(Error **errp)
>      return migrate_xbzrle_cache_size();
>  }
>  
> +void qmp_migrate_set_rdma_port(int64_t port, Error **errp)
> +{
> +    MigrationState *s = migrate_get_current();
> +    if (s && (s->state == MIG_STATE_ACTIVE)) {
> +        return;
> +    }
> +    DPRINTF("rdma migration port: %" PRId64 "\n", port);
> +    rdma_mdata.port = port;
> +    rdma_update_capability(s);
I would allow setting the port only if the capability is set.
> +}
> +
> +void qmp_migrate_set_rdma_host(const char *host, Error **errp)
> +{
> +    MigrationState *s = migrate_get_current();
> +    if (s && (s->state == MIG_STATE_ACTIVE)) {
> +        return;
> +    }
> +    DPRINTF("rdma migration host name: %s\n", host);
> +    strncpy(rdma_mdata.host, host, 64);
> +    rdma_mdata.host[63] = '\0';
> +    rdma_update_capability(s);
see above.
> +}
> +
>  void qmp_migrate_set_speed(int64_t value, Error **errp)
>  {
>      MigrationState *s;
> @@ -505,6 +547,15 @@ int migrate_use_xbzrle(void)
>  {
>      MigrationState *s;
>  
> +    /*
> +     * RFC RDMA: time(run-length encoding) +
> +     *           time(communication) is too big. RDMA throughput tanks
> +     *           when this feature is enabled. But there's no need
> +     *           to change the code since the feature is optional.
> +     */
> +    if (migrate_rdma_enabled())
> +        return 0;
> +

If you won't allow enabling one capability when the other is active,
you won't need this check.

Regards,
Orit
>      s = migrate_get_current();
>  
>      return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
> @@ -571,7 +622,7 @@ static int buffered_put_buffer(void *opaque, const uint8_t *buf,
>      }
>  
>      if (size > (s->buffer_capacity - s->buffer_size)) {
> -        DPRINTF("increasing buffer capacity from %zu by %zu\n",
> +        DPRINTF("increasing buffer capacity from %zu by %d\n",
>                  s->buffer_capacity, size + 1024);
>  
>          s->buffer_capacity += size + 1024;
>
mrhines@linux.vnet.ibm.com - Feb. 14, 2013, 7:29 p.m.
Orit (and anthony if you're not busy),

I forgot to respond to this very important comment:

On 02/13/2013 03:46 AM, Orit Wasserman wrote:
> Are you still using the tcp for transferring device state? If so you 
> can call the tcp functions from the migration rdma code as a first 
> step but I would prefer it to use RDMA too.

This is the crux of the problem of using RDMA for migration: Currently 
all of the QEMU migration control logic and device state goes through 
the QEMUFile implementation. RDMA, however, is by nature a zero-copy 
solution and is incompatible with QEMUFile.

Using RDMA for transferring device state is not recommended: Setting up 
RDMA requires registering the memory locations on both sides with the 
RDMA hardware. This is not ideal because this would require pinning the 
memory holding the device state and then issuing the RDMA transfer for 
*each* type of device - which would require changing the control path of 
every type of migrated device in QEMU.

Currently the patch we submitted bypasses QEMUFile. It just issues 
the RDMA transfer for the memory that was dirtied and then continues 
along with the rest of the migration call path normally.

In an ideal world, we would prefer a hybrid approach, something like:

*Begin Migration Iteration Round:*
1. stop VCPU
2. start iterative pass over memory
3. send control signals (if any) / device state to QEMUFile
4. When a dirty memory page is found, do:
      a) Instruct the QEMUFile to block
      b) Issue the RDMA transfer
      c) Instruct the QEMUFile to unblock
5. resume VCPU

This would require a "smarter" QEMUFile implementation that understands 
when to block and for how long.

Comments?

- Michael
Orit Wasserman - Feb. 18, 2013, 8:24 a.m.
On 02/14/2013 09:29 PM, Michael R. Hines wrote:
> Orit (and anthony if you're not busy),
> 
> I forgot to respond to this very important comment:
> 
> On 02/13/2013 03:46 AM, Orit Wasserman wrote:
>> Are you still using the tcp for transferring device state? If so you can call the tcp functions from the migration rdma code as a first step but I would prefer it to use RDMA too.
> 
> This is the crux of the problem of using RDMA for migration: Currently all of the QEMU migration control logic and device state goes through the the QEMUFile implementation. RDMA, however is by nature a zero-copy solution and is incompatible with QEMUFile.
> 
> Using RDMA for transferring device state is not recommended: Setuping an RDMA requires registering the memory locations on both sides with the RDMA hardware. This is not ideal because this would require pinning the memory holding the device state and then issuing the RDMA transfer for *each* type of device - which would require changing the control path of every type of migrated device in QEMU.
> 
Hi Michael,

The guest device state is quite small (~100K probably less) especially when compared to
 the guest memory, and we are already pinning the guest memory for RDMA anyway.
I was actually wondering about the memory pinning, 
do we pin all guest memory pages as migration starts or on demand? 

> Currently the Patch we submitted bypasses QEMUFile. It does just issues the RDMA transfer for the memory that was dirtied and then continues along with the rest of the migration call path normally.
> 
> In an ideal world, we would prefer a hyrbid approach, something like:
> 
> *Begin Migration Iteration Round:*
> 1. stop VCPU

The vcpus are only stopped in the last phase iteration is done while the guest is running ...

> 2. start iterative pass over memory
> 3. send control signals (if any) / device state to QEMUFile

device state is only sent in the last phase

> 4. When a dirty memory page is found, do:
>      a) Instruct the QEMUFile to block

If there is nothing to send there is no need to block ...

>      b) Issue the RDMA transfer
>      c) Instruct the QEMUFile to unblock
> 5. resume VCPU
no need.
> 
> This would require a "smarter" QEMUFile implementation that understands when to block and for how long.
> 

For the guest memory pages, sending the pages directly without QemuFile (that does buffering) is better,
I would suggest implementing a QemuRDMAFile for this.
It will have a new API for the memory pages (zero copy) so instead of using qemu_put_buffer 
we will call qemu_rdma_buffer or it can reimplement qemu_put_buffer (you need to add offset).

As for device state which is sent in the last phase and is small you can modify the current implementation.
(well Paolo sent patches that are changing this but I think buffering is still an option) 
The current migration implementation copies the device state into a buffer
and then sends the data from the buffer (QemuBufferedFile).
You only need to pin this buffer, and RDMA it after all the device state was written into it.

Regards,
Orit
> Comments?
> 
> - Michael
Paolo Bonzini - Feb. 18, 2013, 10:52 a.m.
Il 14/02/2013 20:29, Michael R. Hines ha scritto:
> 
>> Are you still using the tcp for transferring device state? If so you
>> can call the tcp functions from the migration rdma code as a first
>> step but I would prefer it to use RDMA too.
> 
> This is the crux of the problem of using RDMA for migration: Currently
> all of the QEMU migration control logic and device state goes through
> the the QEMUFile implementation. RDMA, however is by nature a zero-copy
> solution and is incompatible with QEMUFile.

With the patches I sent yesterday, there is no more buffering involved
in migration.c.  All data goes straight from arch_init.c to a QEMUFile.

QEMUFile still does some buffering, but this should change with other
patches that Orit is working on.

> Using RDMA for transferring device state is not recommended: Setuping an
> RDMA requires registering the memory locations on both sides with the
> RDMA hardware. This is not ideal because this would require pinning the
> memory holding the device state and then issuing the RDMA transfer for
> *each* type of device - which would require changing the control path of
> every type of migrated device in QEMU.

Yes, this would not work well.  However, you can (I think) define a
QEMUFileOps implementation for RDMA that:

1) registers the buffer of a QEMUFile with the RDMA hardware;

2) in its get_buffer (receive) and put_buffer (send) callbacks, issues a
synchronous RDMA transfer;

3) unregisters the buffer in the close callback.

As a proof of concept, this would also work (though it would make no
sense) for transferring the RAM; in the end of course it would be used
only for the device state.

It's not a problem to add more operations to QEMUFileOps or similar.
It's also not a problem to change the way buf is allocated, if you need
it to be page-aligned or something like that.

It is much better than adding migration_use_rdma() everywhere.

Paolo

> 
> Currently the Patch we submitted bypasses QEMUFile. It does just issues
> the RDMA transfer for the memory that was dirtied and then continues
> along with the rest of the migration call path normally.
> 
> In an ideal world, we would prefer a hyrbid approach, something like:
> 
> *Begin Migration Iteration Round:*
> 1. stop VCPU
> 2. start iterative pass over memory
> 3. send control signals (if any) / device state to QEMUFile
> 4. When a dirty memory page is found, do:
>      a) Instruct the QEMUFile to block
>      b) Issue the RDMA transfer
>      c) Instruct the QEMUFile to unblock
> 5. resume VCPU
> 
> This would require a "smarter" QEMUFile implementation that understands
> when to block and for how long.
mrhines@linux.vnet.ibm.com - Feb. 19, 2013, 6:03 a.m.
This sounds great. My team was just discussing customizing QEMUFileOps 
to be more "rdma-friendly".

We weren't certain if it would be well received, so this is good to know.

I'll try to scrounge up a patch of some kind before sending out a v3 and 
then do my best to get rid of the migrate_use_rdma() checks 
everywhere.......

Thanks for the suggestion,

- Michael

On 02/18/2013 05:52 AM, Paolo Bonzini wrote:
> Il 14/02/2013 20:29, Michael R. Hines ha scritto:
>>> Are you still using the tcp for transferring device state? If so you
>>> can call the tcp functions from the migration rdma code as a first
>>> step but I would prefer it to use RDMA too.
>> This is the crux of the problem of using RDMA for migration: Currently
>> all of the QEMU migration control logic and device state goes through
>> the the QEMUFile implementation. RDMA, however is by nature a zero-copy
>> solution and is incompatible with QEMUFile.
> With the patches I sent yesterday, there is no more buffering involved
> in migration.c.  All data goes straight from arch_init.c to a QEMUFile.
>
> QEMUFile still does some buffering, but this should change with other
> patches that Orit is working on.
>
>> Using RDMA for transferring device state is not recommended: Setuping an
>> RDMA requires registering the memory locations on both sides with the
>> RDMA hardware. This is not ideal because this would require pinning the
>> memory holding the device state and then issuing the RDMA transfer for
>> *each* type of device - which would require changing the control path of
>> every type of migrated device in QEMU.
> Yes, this would not work well.  However, you can (I think) define a
> QEMUFileOps implementation for RDMA that:
>
> 1) registers the buffer of a QEMUFile with the RDMA hardware;
>
> 2) in its get_buffer (receive) and put_buffer (send) callbacks, issues a
> synchronous RDMA transfer;
>
> 3) unregisters the buffer in the close callback.
>
> As a proof of concept, this would also work (though it would make no
> sense) for transferring the RAM; in the end of course it would be used
> only for the device state.
>
> It's not a problem to add more operations to QEMUFileOps or similar.
> It's also not a problem to change the way buf is allocated, if you need
> it to be page-aligned or something like that.
>
> It is much better than adding migration_use_rdma() everywhere.
>
> Paolo
mrhines@linux.vnet.ibm.com - Feb. 19, 2013, 3:41 p.m.
On 02/18/2013 03:24 AM, Orit Wasserman wrote:
> Hi Michael, The guest device state is quite small (~100K probably 
> less) especially when compared to the guest memory and we already are 
> pinning the guest memory for RDMA any way I was actually wondering 
> about the memory pinning, do we pin all guest memory pages as 
> migration starts or on demand? 

The patch supports both methods. There's a function called 
"rdma_server_prepare()" which optionally pins all the memory in advance.

We prefer on-demand pinning, of course, because we want to preserve the 
ability to do ballooning and the occasional madvise() calls.

The patch defaults to pinning everything right now for performance 
evaluation.......... later I'll make sure to switch that off when we've 
coalesced on a solution for the actual RDMA transfer itself.

> For the guest memory pages, sending the pages directly without 
> QemuFile (that does buffering) is better, I would suggest implementing 
> an QemuRDMAFile for this. It will have a new API for the memory pages 
> (zero copy) so instead of using qemu_put_buffer we will call 
> qemu_rdma_buffer or it can reimplement qemu_put_buffer (you need to 
> add offset). As for device state which is sent in the last phase and 
> is small you can modify the current implementation. (well Paolo sent 
> patches that are changing this but I think buffering is still an 
> option) The current migration implementation copies the device state 
> into a buffer and than send the data from the buffer 
> (QemuBufferedFile). You only need to pin this buffer, and RDMA it 
> after all the device state was written into it. Regards, Orit 
I like it .......... can you help me understand - how much different is 
this design than the "QEMUFileOps" design that Paolo suggested?

Or is it basically the same? ....reimplementing 
qemu_put_buffer/get_buffer() for RDMA purposes......

- Michael
Orit Wasserman - Feb. 19, 2013, 4:39 p.m.
On 02/19/2013 05:41 PM, Michael R. Hines wrote:
> On 02/18/2013 03:24 AM, Orit Wasserman wrote:
>> Hi Michael, The guest device state is quite small (~100K probably less) especially when compared to the guest memory and we already are pinning the guest memory for RDMA any way I was actually wondering about the memory pinning, do we pin all guest memory pages as migration starts or on demand? 
> 
> The patch supports both methods. There's a function called "rdma_server_prepare()" which optionally pins all the memory in advanced.
> 
> We prefer on-demand pinning, of course, because we want to preserve the ability to do ballooning and the occasional madvise() calls.
> 
> The patch defaults to pinning everything right now for performance evaluation.......... later I'll make sure to switch that off when we've coalesced on a solution for the actual RDMA transfer itself.
Do you have some results as to the performance cost on demand pinning has?
> 
>> For the guest memory pages, sending the pages directly without QemuFile (that does buffering) is better, I would suggest implementing an QemuRDMAFile for this. It will have a new API for the memory pages (zero copy) so instead of using qemu_put_buffer we will call qemu_rdma_buffer or it can reimplement qemu_put_buffer (you need to add offset). As for device state which is sent in the last phase and is small you can modify the current implementation. (well Paolo sent patches that are changing this but I think buffering is still an option) The current migration implementation copies the device state into a buffer and than send the data from the buffer (QemuBufferedFile). You only need to pin this buffer, and RDMA it after all the device state was written into it. Regards, Orit 
> I like it .......... can you help me understand - how much different is this design than the "QEMUFileOps" design that Paulo suggested?
> 
> Or it basically the same? ....reimplementing qemu_put_buffer/get_buffer() for RDMA purposes......
> 
Yes it is basically the same :).
you will need the QEMUFileRDMA to store the rdma context (look at QEMUFileSocket for example) and other specific rdma parameters.
you will need the rdma_file_ops (QEMUFileOps) to implement sending guest pages directly and pinning the buffer that contains the device state.
you won't need to change qemu_put_buffer for the device state but implement a rdma_put_buffer ops that will pin the buffer and send it.
As for the guest memory pages you have two options:
update qemu_put_buffer/get_buffer
add new QEMUFileOps just for rdma and call it instead of qemu_put_buffer for guest pages.

Cheers,
Orit
> - Michael
>
mrhines@linux.vnet.ibm.com - Feb. 19, 2013, 9:22 p.m.
On 02/19/2013 11:39 AM, Orit Wasserman wrote:
> Do you have some results as to the performance cost on demand pinning has?
Yes, there's a single graph on the QEMU wiki that shows this:

http://wiki.qemu.org/Features/RDMALiveMigration

Pay attention to the last group of bars, labelled "A", "B", and "C".

"A" is called "Chunked Page Registration" (i.e. on-demand pinning in 
groups of about 1MB of the surrounding memory pages)
"B" pins everything in advance

On-demand has only a slightly larger cost, but not much.

>>> For the guest memory pages, sending the pages directly without QemuFile (that does buffering) is better, I would suggest implementing an QemuRDMAFile for this. It will have a new API for the memory pages (zero copy) so instead of using qemu_put_buffer we will call qemu_rdma_buffer or it can reimplement qemu_put_buffer (you need to add offset). As for device state which is sent in the last phase and is small you can modify the current implementation. (well Paolo sent patches that are changing this but I think buffering is still an option) The current migration implementation copies the device state into a buffer and than send the data from the buffer (QemuBufferedFile). You only need to pin this buffer, and RDMA it after all the device state was written into it. Regards, Orit
>> I like it .......... can you help me understand - how much different is this design than the "QEMUFileOps" design that Paulo suggested?
>>
>> Or it basically the same? ....reimplementing qemu_put_buffer/get_buffer() for RDMA purposes......
>>
> Yes it is basically the same :).
> you will need the QEMUFileRDMA to store the rdma context (look at QEMUFileSocket for example) and other specific rdma parameters.
> you will the rdma_file_ops (QEMUFileOps) to implement sending guest pages directly and pinning the buffer that contain the device state.
> you won't need to change qemu_put_buffer for the device state but implement a rdma_put_buffer ops that will pin the buffer and send it.
> As for the guest memory pages you have two options:
> update qemu_put_buffer/get_buffer
> add new QEMUFileOps just for rdma and call it instead of qemu_put_buffer for guest pages.

Understood.... I have some code reading to do with these pointers, then. 
Will have to teach myself everything you just said =)
Paolo Bonzini - Feb. 19, 2013, 9:34 p.m.
Il 19/02/2013 22:22, Michael R. Hines ha scritto:
>> Yes it is basically the same :).
>> you will need the QEMUFileRDMA to store the rdma context (look at
>> QEMUFileSocket for example) and other specific rdma parameters.
>> you will the rdma_file_ops (QEMUFileOps) to implement sending guest
>> pages directly and pinning the buffer that contain the device state.
>> you won't need to change qemu_put_buffer for the device state but
>> implement a rdma_put_buffer ops that will pin the buffer and send it.
>> As for the guest memory pages you have two options:
>> update qemu_put_buffer/get_buffer
>> add new QEMUFileOps just for rdma and call it instead of
>> qemu_put_buffer for guest pages.
> 
> Understood.... I have some code reading to do with these pointers, then.
> Will have to teach myself everything you just said =)

Look at my migration-thread-20130115 branch
(git://github.com/bonzini/qemu.git).  It's much simpler than the current
code, and should be easier to understand and extend.

Paolo
Michael S. Tsirkin - Feb. 21, 2013, 8:14 p.m.
On Thu, Feb 14, 2013 at 02:29:09PM -0500, Michael R. Hines wrote:
> Orit (and anthony if you're not busy),
> 
> I forgot to respond to this very important comment:
> 
> On 02/13/2013 03:46 AM, Orit Wasserman wrote:
> 
>     Are you still using the tcp for transferring device state? If so you can
>     call the tcp functions from the migration rdma code as a first step but I
>     would prefer it to use RDMA too.
> 
> 
> This is the crux of the problem of using RDMA for migration: Currently all of
> the QEMU migration control logic and device state goes through the the QEMUFile
> implementation. RDMA, however is by nature a zero-copy solution and is
> incompatible with QEMUFile.

RDMA might be overkill but you could reuse the same connection,
using send instructions.

> Using RDMA for transferring device state is not recommended: Setuping an RDMA
> requires registering the memory locations on both sides with the RDMA hardware.
> This is not ideal because this would require pinning the memory holding the
> device state and then issuing the RDMA transfer for *each* type of device -
> which would require changing the control path of every type of migrated device
> in QEMU.
> 
> Currently the Patch we submitted bypasses QEMUFile. It does just issues the
> RDMA transfer for the memory that was dirtied and then continues along with the
> rest of the migration call path normally.
> 
> In an ideal world, we would prefer a hyrbid approach, something like:
> 
> Begin Migration Iteration Round:
> 1. stop VCPU
> 2. start iterative pass over memory
> 3. send control signals (if any) / device state to QEMUFile
> 4. When a dirty memory page is found, do:
>      a) Instruct the QEMUFile to block
>      b) Issue the RDMA transfer
>      c) Instruct the QEMUFile to unblock
> 5. resume VCPU
> 
> This would require a "smarter" QEMUFile implementation that understands when to
> block and for how long.
> 
> Comments?
> 
> - Michael
Michael S. Tsirkin - Feb. 21, 2013, 8:16 p.m.
On Tue, Feb 19, 2013 at 10:41:08AM -0500, Michael R. Hines wrote:
> On 02/18/2013 03:24 AM, Orit Wasserman wrote:
> >Hi Michael, The guest device state is quite small (~100K probably
> >less) especially when compared to the guest memory and we already
> >are pinning the guest memory for RDMA any way I was actually
> >wondering about the memory pinning, do we pin all guest memory
> >pages as migration starts or on demand?
> 
> The patch supports both methods. There's a function called
> "rdma_server_prepare()" which optionally pins all the memory in
> advanced.
> 
> We prefer on-demand pinning, of course, because we want to preserve
> the ability to do ballooning and the occasional madvise() calls.
> 
> The patch defaults to pinning everything right now for performance
> evaluation.......... later I'll make sure to switch that off when
> we've coalesced on a solution for the actual RDMA transfer itself.

How well does the incremental pinning work?
The pin everything approach does not look very useful
because it conflicts with memory overcommit.

> >For the guest memory pages, sending the pages directly without
> >QemuFile (that does buffering) is better, I would suggest
> >implementing an QemuRDMAFile for this. It will have a new API for
> >the memory pages (zero copy) so instead of using qemu_put_buffer
> >we will call qemu_rdma_buffer or it can reimplement
> >qemu_put_buffer (you need to add offset). As for device state
> >which is sent in the last phase and is small you can modify the
> >current implementation. (well Paolo sent patches that are changing
> >this but I think buffering is still an option) The current
> >migration implementation copies the device state into a buffer and
> >than send the data from the buffer (QemuBufferedFile). You only
> >need to pin this buffer, and RDMA it after all the device state
> >was written into it. Regards, Orit
> I like it .......... can you help me understand - how much different
> is this design than the "QEMUFileOps" design that Paulo suggested?
> 
> Or it basically the same? ....reimplementing
> qemu_put_buffer/get_buffer() for RDMA purposes......
> 
> - Michael
>
Michael S. Tsirkin - Feb. 21, 2013, 8:20 p.m.
I don't think people mind using RDMA specifically,
small amounts of data can be sent using SEND commands.
But it's cleaner not to use TCP for parts of data.


On Tue, Feb 19, 2013 at 01:03:59AM -0500, Michael R. Hines wrote:
> 
> This sounds great. My team was just discussing customizing
> QEMUFileOps to be more "rdma-friendly".
> 
> We weren't certain if it would be well received, so this is good to know.
> 
> I'll try to scrounge up a patch of some kind before sending out a v3
> and then do my best to get rid of the migrate_use_rdma() checks
> everywhere.......
> 
> Thanks for the suggestion,
> 
> - Michael
> 
> On 02/18/2013 05:52 AM, Paolo Bonzini wrote:
> >Il 14/02/2013 20:29, Michael R. Hines ha scritto:
> >>>Are you still using the tcp for transferring device state? If so you
> >>>can call the tcp functions from the migration rdma code as a first
> >>>step but I would prefer it to use RDMA too.
> >>This is the crux of the problem of using RDMA for migration: Currently
> >>all of the QEMU migration control logic and device state goes through
> >>the the QEMUFile implementation. RDMA, however is by nature a zero-copy
> >>solution and is incompatible with QEMUFile.
> >With the patches I sent yesterday, there is no more buffering involved
> >in migration.c.  All data goes straight from arch_init.c to a QEMUFile.
> >
> >QEMUFile still does some buffering, but this should change with other
> >patches that Orit is working on.
> >
> >>Using RDMA for transferring device state is not recommended: Setuping an
> >>RDMA requires registering the memory locations on both sides with the
> >>RDMA hardware. This is not ideal because this would require pinning the
> >>memory holding the device state and then issuing the RDMA transfer for
> >>*each* type of device - which would require changing the control path of
> >>every type of migrated device in QEMU.
> >Yes, this would not work well.  However, you can (I think) define a
> >QEMUFileOps implementation for RDMA that:
> >
> >1) registers the buffer of a QEMUFile with the RDMA hardware;
> >
> >2) in its get_buffer (receive) and put_buffer (send) callbacks, issues a
> >synchronous RDMA transfer;
> >
> >3) unregisters the buffer in the close callback.
> >
> >As a proof of concept, this would also work (though it would make no
> >sense) for transferring the RAM; in the end of course it would be used
> >only for the device state.
> >
> >It's not a problem to add more operations to QEMUFileOps or similar.
> >It's also not a problem to change the way buf is allocated, if you need
> >it to be page-aligned or something like that.
> >
> >It is much better than adding migration_use_rdma() everywhere.
> >
> >Paolo
>
mrhines@linux.vnet.ibm.com - Feb. 28, 2013, 11:24 p.m.
On 02/21/2013 03:20 PM, Michael S. Tsirkin wrote:
> I don't think people mind using RDMA specifically,
> small amounts of data can be sent using SEND commands.
> But it's cleaner not to use TCP for parts of data.
>
Understood. Still debating with my co-workers when and how long it will 
take to implement this.

Will keep everyone posted....

- Michael

Patch

diff --git a/migration-tcp.c b/migration-tcp.c
index e78a296..113576e 100644
--- a/migration-tcp.c
+++ b/migration-tcp.c
@@ -14,6 +14,7 @@ 
  */
 
 #include "qemu-common.h"
+#include "qemu/rdma.h"
 #include "qemu/sockets.h"
 #include "migration/migration.h"
 #include "migration/qemu-file.h"
@@ -55,6 +56,9 @@  static void tcp_wait_for_connect(int fd, void *opaque)
 
     if (fd < 0) {
         DPRINTF("migrate connect error\n");
+        if (migration_use_rdma()) {
+            rdma_cleanup(&rdma_mdata);
+        }
         s->fd = -1;
         migrate_fd_error(s);
     } else {
@@ -62,6 +66,10 @@  static void tcp_wait_for_connect(int fd, void *opaque)
         s->fd = fd;
         socket_set_block(s->fd);
         migrate_fd_connect(s);
+
+        if (migration_use_rdma() && rdma_wait_for_connect(fd, opaque)) {
+            migrate_fd_error(s);
+        }
     }
 }
 
@@ -101,6 +109,12 @@  static void tcp_accept_incoming_migration(void *opaque)
         goto out;
     }
 
+    if (migration_use_rdma() && 
+        rdma_accept_incoming_migration(&rdma_mdata)) {
+        close(s);
+        return;
+    }
+
     process_incoming_migration(f);
     return;
 
@@ -117,6 +131,11 @@  void tcp_start_incoming_migration(const char *host_port, Error **errp)
         return;
     }
 
+    if (migration_use_rdma() && rdma_start_incoming_migration(s)) {
+        close(s);
+        return;
+    }
+
     qemu_set_fd_handler2(s, NULL, tcp_accept_incoming_migration, NULL,
                          (void *)(intptr_t)s);
 }
diff --git a/migration.c b/migration.c
index 77c1971..5ef923d 100644
--- a/migration.c
+++ b/migration.c
@@ -22,6 +22,7 @@ 
 #include "qemu/sockets.h"
 #include "migration/block.h"
 #include "qemu/thread.h"
+#include "qemu/rdma.h"
 #include "qmp-commands.h"
 
 //#define DEBUG_MIGRATION
@@ -262,6 +263,16 @@  void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
     }
 
     for (cap = params; cap; cap = cap->next) {
+        if (cap->value->capability == MIGRATION_CAPABILITY_RDMA) {
+#ifndef CONFIG_RDMA
+            error_set(errp, QERR_MIGRATION_RDMA_NOT_ENABLED);
+            continue;
+#endif
+            if (!migration_use_rdma()) {
+                error_set(errp, QERR_MIGRATION_RDMA_NOT_CONFIGURED, rdma_mdata.host, rdma_mdata.port);
+                continue;
+            }
+        }
         s->enabled_capabilities[cap->value->capability] = cap->value->state;
     }
 }
@@ -279,6 +290,11 @@  static int migrate_fd_cleanup(MigrationState *s)
     }
 
     assert(s->fd == -1);
+
+    if (migrate_rdma_enabled()) {
+        rdma_cleanup(&rdma_mdata);
+    }
+
     return ret;
 }
 
@@ -386,6 +402,9 @@  static MigrationState *migrate_init(const MigrationParams *params)
     s->params = *params;
     memcpy(s->enabled_capabilities, enabled_capabilities,
            sizeof(enabled_capabilities));
+
+    rdma_update_capability(s);
+
     s->xbzrle_cache_size = xbzrle_cache_size;
 
     s->bandwidth_limit = bandwidth_limit;
@@ -481,6 +500,29 @@  int64_t qmp_query_migrate_cache_size(Error **errp)
     return migrate_xbzrle_cache_size();
 }
 
+/*
+ * QMP command handler: set the port used for RDMA migration.
+ *
+ * Refuses to change the port while a migration is already active, and
+ * reports the refusal through @errp instead of silently ignoring it
+ * (the original dropped the request without telling the caller).
+ * Also rejects ports outside the valid TCP/RDMA range.
+ */
+void qmp_migrate_set_rdma_port(int64_t port, Error **errp)
+{
+    MigrationState *s = migrate_get_current();
+
+    if (s && (s->state == MIG_STATE_ACTIVE)) {
+        error_setg(errp, "cannot change RDMA port during an active migration");
+        return;
+    }
+    if (port < 0 || port > 65535) {
+        error_setg(errp, "RDMA port must be between 0 and 65535");
+        return;
+    }
+    DPRINTF("rdma migration port: %" PRId64 "\n", port);
+    rdma_mdata.port = port;
+    rdma_update_capability(s);
+}
+
+/*
+ * QMP command handler: set the host name/address used for RDMA migration.
+ *
+ * Refuses to change the host while a migration is already active, and
+ * reports the refusal through @errp instead of silently ignoring it.
+ *
+ * NOTE(review): the copy is sized from the destination buffer rather
+ * than a magic 64; this assumes rdma_mdata.host is a fixed-size char
+ * array (as the original hardcoded host[63] suggests) — confirm against
+ * the declaration in qemu/rdma.h.
+ */
+void qmp_migrate_set_rdma_host(const char *host, Error **errp)
+{
+    MigrationState *s = migrate_get_current();
+
+    if (s && (s->state == MIG_STATE_ACTIVE)) {
+        error_setg(errp, "cannot change RDMA host during an active migration");
+        return;
+    }
+    DPRINTF("rdma migration host name: %s\n", host);
+    /* snprintf always NUL-terminates, unlike strncpy. */
+    snprintf(rdma_mdata.host, sizeof(rdma_mdata.host), "%s", host);
+    rdma_update_capability(s);
+}
+
 void qmp_migrate_set_speed(int64_t value, Error **errp)
 {
     MigrationState *s;
@@ -505,6 +547,15 @@  int migrate_use_xbzrle(void)
 {
     MigrationState *s;
 
+    /*
+     * RFC RDMA: time(run-length encoding) +
+     *           time(communication) is too big. RDMA throughput tanks
+     *           when this feature is enabled. But there's no need
+     *           to change the code since the feature is optional.
+     */
+    if (migrate_rdma_enabled())
+        return 0;
+
     s = migrate_get_current();
 
     return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
@@ -571,7 +622,7 @@  static int buffered_put_buffer(void *opaque, const uint8_t *buf,
     }
 
     if (size > (s->buffer_capacity - s->buffer_size)) {
-        DPRINTF("increasing buffer capacity from %zu by %zu\n",
+        DPRINTF("increasing buffer capacity from %zu by %d\n",
                 s->buffer_capacity, size + 1024);
 
         s->buffer_capacity += size + 1024;