Patchwork [RFC,RDMA,support,v4:,08/10] introduce QEMUFileRDMA

login
register
mail settings
Submitter mrhines@linux.vnet.ibm.com
Date March 18, 2013, 3:19 a.m.
Message ID <1363576743-6146-9-git-send-email-mrhines@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/228355/
State New
Headers show

Comments

mrhines@linux.vnet.ibm.com - March 18, 2013, 3:19 a.m.
From: "Michael R. Hines" <mrhines@us.ibm.com>

This compiles with and without --enable-rdma.

Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
---
 include/migration/qemu-file.h |   10 +++
 savevm.c                      |  172 ++++++++++++++++++++++++++++++++++++++---
 2 files changed, 172 insertions(+), 10 deletions(-)
Paolo Bonzini - March 18, 2013, 9:09 a.m.
Il 18/03/2013 04:19, mrhines@linux.vnet.ibm.com ha scritto:
> From: "Michael R. Hines" <mrhines@us.ibm.com>
> 
> This compiles with and without --enable-rdma.
> 
> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com>
> ---
>  include/migration/qemu-file.h |   10 +++
>  savevm.c                      |  172 ++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 172 insertions(+), 10 deletions(-)
> 
> diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
> index df81261..9046751 100644
> --- a/include/migration/qemu-file.h
> +++ b/include/migration/qemu-file.h
> @@ -51,23 +51,33 @@ typedef int (QEMUFileCloseFunc)(void *opaque);
>   */
>  typedef int (QEMUFileGetFD)(void *opaque);
>  
> +/* 
> + * 'drain' from a QEMUFile perspective means
> + * to flush the outbound send buffer
> + * (if one exists). (Only used by RDMA right now)
> + */
> +typedef int (QEMUFileDrainFunc)(void *opaque);
> +
>  typedef struct QEMUFileOps {
>      QEMUFilePutBufferFunc *put_buffer;
>      QEMUFileGetBufferFunc *get_buffer;
>      QEMUFileCloseFunc *close;
>      QEMUFileGetFD *get_fd;
> +    QEMUFileDrainFunc *drain;
>  } QEMUFileOps;
>  
>  QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
>  QEMUFile *qemu_fopen(const char *filename, const char *mode);
>  QEMUFile *qemu_fdopen(int fd, const char *mode);
>  QEMUFile *qemu_fopen_socket(int fd, const char *mode);
> +QEMUFile *qemu_fopen_rdma(void *opaque, const char *mode);
>  QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
>  int qemu_get_fd(QEMUFile *f);
>  int qemu_fclose(QEMUFile *f);
>  int64_t qemu_ftell(QEMUFile *f);
>  void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
>  void qemu_put_byte(QEMUFile *f, int v);
> +int qemu_drain(QEMUFile *f);
>  
>  static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
>  {
> diff --git a/savevm.c b/savevm.c
> index 35c8d1e..9b90b7f 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -32,6 +32,7 @@
>  #include "qemu/timer.h"
>  #include "audio/audio.h"
>  #include "migration/migration.h"
> +#include "migration/rdma.h"
>  #include "qemu/sockets.h"
>  #include "qemu/queue.h"
>  #include "sysemu/cpus.h"
> @@ -143,6 +144,13 @@ typedef struct QEMUFileSocket
>      QEMUFile *file;
>  } QEMUFileSocket;
>  
> +typedef struct QEMUFileRDMA
> +{
> +    void *rdma;

This is an RDMAData *.  Please avoid using void * as much as possible.

> +    size_t len;
> +    QEMUFile *file;
> +} QEMUFileRDMA;
> +
>  typedef struct {
>      Coroutine *co;
>      int fd;
> @@ -178,6 +186,66 @@ static int socket_get_fd(void *opaque)
>      return s->fd;
>  }
>  
> +/*
> + * SEND messages for non-live state only.
> + * pc.ram is handled elsewhere...
> + */
> +static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size)
> +{
> +    QEMUFileRDMA *r = opaque;
> +    size_t remaining = size;
> +    uint8_t * data = (void *) buf;
> +
> +    /*
> +     * Although we're sending non-live
> +     * state here, push out any writes that
> +     * we've queued up for pc.ram anyway.
> +     */
> +    if (qemu_rdma_write_flush(r->rdma) < 0)
> +        return -EIO;
> +
> +    while(remaining) {
> +        r->len = MIN(remaining, RDMA_SEND_INCREMENT);
> +        remaining -= r->len;
> +
> +        if(qemu_rdma_exchange_send(r->rdma, data, r->len) < 0)
> +                return -EINVAL;
> +
> +        data += r->len;
> +    }
> +
> +    return size;
> +} 
> +
> +/*
> + * RDMA links don't use bytestreams, so we have to
> + * return bytes to QEMUFile opportunistically.
> + */
> +static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
> +{
> +    QEMUFileRDMA *r = opaque;
> +
> +    /*
> +     * First, we hold on to the last SEND message we 
> +     * were given and dish out the bytes until we run 
> +     * out of bytes.
> +     */
> +    if((r->len = qemu_rdma_fill(r->rdma, buf, size)))
> +	return r->len; 
> +
> +     /*
> +      * Once we run out, we block and wait for another
> +      * SEND message to arrive.
> +      */
> +    if(qemu_rdma_exchange_recv(r->rdma) < 0)
> +	return -EINVAL;
> +
> +    /*
> +     * SEND was received with new bytes, now try again.
> +     */
> +    return qemu_rdma_fill(r->rdma, buf, size);
> +} 

Please move these functions closer to qemu_fopen_rdma (or better, to an
RDMA-specific file altogether).  Also, using qemu_rdma_fill introduces a
dependency of savevm.c on migration-rdma.c.  There should be no such
dependency; migration-rdma.c should be used only by migration.c.

>  static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
>  {
>      QEMUFileSocket *s = opaque;
> @@ -390,16 +458,24 @@ static const QEMUFileOps socket_write_ops = {
>      .close =      socket_close
>  };
>  
> -QEMUFile *qemu_fopen_socket(int fd, const char *mode)
> +static bool qemu_mode_is_not_valid(const char * mode)
>  {
> -    QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
> -
>      if (mode == NULL ||
>          (mode[0] != 'r' && mode[0] != 'w') ||
>          mode[1] != 'b' || mode[2] != 0) {
>          fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
> -        return NULL;
> +        return true;
>      }
> +    
> +    return false;
> +}
> +
> +QEMUFile *qemu_fopen_socket(int fd, const char *mode)
> +{
> +    QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
> +
> +    if(qemu_mode_is_not_valid(mode))
> +	return NULL;
>  
>      s->fd = fd;
>      if (mode[0] == 'w') {
> @@ -411,16 +487,66 @@ QEMUFile *qemu_fopen_socket(int fd, const char *mode)
>      return s->file;
>  }
>  
> +static int qemu_rdma_close(void *opaque)
> +{
> +    QEMUFileRDMA *r = opaque;
> +    if(r->rdma) {
> +        qemu_rdma_cleanup(r->rdma);
> +        g_free(r->rdma);
> +    }
> +    g_free(r);
> +    return 0;
> +}
> +
> +void * migrate_use_rdma(QEMUFile *f)
> +{
> +    QEMUFileRDMA *r = f->opaque;
> +
> +    return qemu_rdma_enabled(r->rdma) ? r->rdma : NULL;

You cannot be sure that f->opaque->rdma is a valid pointer.  For
example, the first field in a socket QEMUFile's is a file descriptor.

Instead, you could use a qemu_file_ops_are(const QEMUFile *, const
QEMUFileOps *) function that checks if the file uses the given ops.
Then, migrate_use_rdma can simply check if the QEMUFile is using the
RDMA ops structure.

With this change, the "enabled" field of RDMAData should go.

> +}
> +
> +static int qemu_rdma_drain_completion(void *opaque)
> +{
> +    QEMUFileRDMA *r = opaque;
> +    r->len = 0;
> +    return qemu_rdma_drain_cq(r->rdma);
> +}
> +
> +static const QEMUFileOps rdma_read_ops = {
> +    .get_buffer = qemu_rdma_get_buffer,
> +    .close =      qemu_rdma_close,
> +};
> +
> +static const QEMUFileOps rdma_write_ops = {
> +    .put_buffer = qemu_rdma_put_buffer,
> +    .close =      qemu_rdma_close,
> +    .drain =	  qemu_rdma_drain_completion,
> +};
> +
> +QEMUFile *qemu_fopen_rdma(void *opaque, const char * mode)
> +{
> +    QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA));
> +
> +    if(qemu_mode_is_not_valid(mode))
> +	return NULL;
> +
> +    r->rdma = opaque;
> +
> +    if (mode[0] == 'w') {
> +        r->file = qemu_fopen_ops(r, &rdma_write_ops);
> +    } else {
> +        r->file = qemu_fopen_ops(r, &rdma_read_ops);
> +    }
> +
> +    return r->file;
> +}
> +
>  QEMUFile *qemu_fopen(const char *filename, const char *mode)
>  {
>      QEMUFileStdio *s;
>  
> -    if (mode == NULL ||
> -	(mode[0] != 'r' && mode[0] != 'w') ||
> -	mode[1] != 'b' || mode[2] != 0) {
> -        fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
> -        return NULL;
> -    }
> +    if(qemu_mode_is_not_valid(mode))
> +	return NULL;
>  
>      s = g_malloc0(sizeof(QEMUFileStdio));
>  
> @@ -497,6 +623,24 @@ static void qemu_file_set_error(QEMUFile *f, int ret)
>      }
>  }
>  
> +/*
> + * Called only for RDMA right now at the end 
> + * of each live iteration of memory.
> + *
> + * 'drain' from a QEMUFile perspective means
> + * to flush the outbound send buffer
> + * (if one exists). 
> + *
> + * For RDMA, this means to make sure we've
> + * received completion queue (CQ) messages
> + * successfully for all of the RDMA writes
> + * that we requested.
> + */ 
> +int qemu_drain(QEMUFile *f)
> +{
> +    return f->ops->drain ? f->ops->drain(f->opaque) : 0;
> +}

Hmm, this is very similar to qemu_fflush, but not quite. :/

Why exactly is this needed?

>  /** Flushes QEMUFile buffer
>   *
>   */
> @@ -723,6 +867,8 @@ int qemu_get_byte(QEMUFile *f)
>  int64_t qemu_ftell(QEMUFile *f)
>  {
>      qemu_fflush(f);
> +    if(migrate_use_rdma(f))
> +	return delta_norm_mig_bytes_transferred();

Not needed, and another undesirable dependency (savevm.c ->
arch_init.c).  Just update f->pos in save_rdma_page.

This is taking shape.  Thanks for persevering!

Paolo

>      return f->pos;
>  }
>  
> @@ -1737,6 +1883,12 @@ void qemu_savevm_state_complete(QEMUFile *f)
>          }
>      }
>  
> +    if ((ret = qemu_drain(f)) < 0) {
> +	fprintf(stderr, "failed to drain RDMA first!\n");
> +        qemu_file_set_error(f, ret);
> +	return;
> +    }
> +
>      QTAILQ_FOREACH(se, &savevm_handlers, entry) {
>          int len;
>  
>
mrhines@linux.vnet.ibm.com - March 18, 2013, 8:33 p.m.
Comments inline - tell me what you think.......

On 03/18/2013 05:09 AM, Paolo Bonzini wrote:
> +typedef struct QEMUFileRDMA
> +{
> +    void *rdma;
> This is an RDMAData *.  Please avoid using void * as much as possible.
Acknowledged - forgot to move this to rdma.c, so it doesn't have to be 
void anymore.
>      */
> +    return qemu_rdma_fill(r->rdma, buf, size);
> +}
> Please move these functions closer to qemu_fopen_rdma (or better, to an
> RDMA-specific file altogether).  Also, using qemu_rdma_fill introduces a
> dependency of savevm.c on migration-rdma.c.  There should be no such
> dependency; migration-rdma.c should be used only by migration.c.
Acknowledged......
> +void * migrate_use_rdma(QEMUFile *f)
> +{
> +    QEMUFileRDMA *r = f->opaque;
> +
> +    return qemu_rdma_enabled(r->rdma) ? r->rdma : NULL;
> You cannot be sure that f->opaque->rdma is a valid pointer.  For
> example, the first field in a socket QEMUFile's is a file descriptor.
>
> Instead, you could use a qemu_file_ops_are(const QEMUFile *, const
> QEMUFileOps *) function that checks if the file uses the given ops.
> Then, migrate_use_rdma can simply check if the QEMUFile is using the
> RDMA ops structure.
>
> With this change, the "enabled" field of RDMAData should go.

Great - I like that...... will do....

> +/*
> + * Called only for RDMA right now at the end
> + * of each live iteration of memory.
> + *
> + * 'drain' from a QEMUFile perspective means
> + * to flush the outbound send buffer
> + * (if one exists).
> + *
> + * For RDMA, this means to make sure we've
> + * received completion queue (CQ) messages
> + * successfully for all of the RDMA writes
> + * that we requested.
> + */
> +int qemu_drain(QEMUFile *f)
> +{
> +    return f->ops->drain ? f->ops->drain(f->opaque) : 0;
> +}
> Hmm, this is very similar to qemu_fflush, but not quite. :/
>
> Why exactly is this needed?

Good idea - I'll replace drain with flush once I've added
the  "qemu_file_ops_are(const QEMUFile *, const QEMUFileOps *) "
that you recommended......


>>   /** Flushes QEMUFile buffer
>>    *
>>    */
>> @@ -723,6 +867,8 @@ int qemu_get_byte(QEMUFile *f)
>>   int64_t qemu_ftell(QEMUFile *f)
>>   {
>>       qemu_fflush(f);
>> +    if(migrate_use_rdma(f))
>> +	return delta_norm_mig_bytes_transferred();
> Not needed, and another undesirable dependency (savevm.c ->
> arch_init.c).  Just update f->pos in save_rdma_page.

f->pos isn't good enough because save_rdma_page does not
go through QEMUFile directly - only non-live state goes
through QEMUFile ....... pc.ram uses direct RDMA writes.

As a result, the position pointer does not get updated
and the accounting is missed........

- Michael
Paolo Bonzini - March 19, 2013, 9:18 a.m.
Il 18/03/2013 21:33, Michael R. Hines ha scritto:
>>
>> +int qemu_drain(QEMUFile *f)
>> +{
>> +    return f->ops->drain ? f->ops->drain(f->opaque) : 0;
>> +}
>> Hmm, this is very similar to qemu_fflush, but not quite. :/
>>
>> Why exactly is this needed?
> 
> Good idea - I'll replace drain with flush once I added
> the  "qemu_file_ops_are(const QEMUFile *, const QEMUFileOps *) "
> that you recommended......

If I understand correctly, the problem is that save_rdma_page is
asynchronous and you have to wait for pending operations to do the
put_buffer protocol correctly.

Would it work to just do the "drain" in the put_buffer operation, if and
only if it was preceded by a save_rdma_page operation?

> 
>>>   /** Flushes QEMUFile buffer
>>>    *
>>>    */
>>> @@ -723,6 +867,8 @@ int qemu_get_byte(QEMUFile *f)
>>>   int64_t qemu_ftell(QEMUFile *f)
>>>   {
>>>       qemu_fflush(f);
>>> +    if(migrate_use_rdma(f))
>>> +    return delta_norm_mig_bytes_transferred();
>> Not needed, and another undesirable dependency (savevm.c ->
>> arch_init.c).  Just update f->pos in save_rdma_page.
> 
> f->pos isn't good enough because save_rdma_page does not
> go through QEMUFile directly - only non-live state goes
> through QEMUFile ....... pc.ram uses direct RDMA writes.
> 
> As a result, the position pointer does not get updated
> and the accounting is missed........

Yes, I am suggesting to modify f->pos in save_rdma_page instead.

Paolo
mrhines@linux.vnet.ibm.com - March 19, 2013, 1:12 p.m.
On 03/19/2013 05:18 AM, Paolo Bonzini wrote:
> Il 18/03/2013 21:33, Michael R. Hines ha scritto:
>>> +int qemu_drain(QEMUFile *f)
>>> +{
>>> +    return f->ops->drain ? f->ops->drain(f->opaque) : 0;
>>> +}
>>> Hmm, this is very similar to qemu_fflush, but not quite. :/
>>>
>>> Why exactly is this needed?
>> Good idea - I'll replace drain with flush once I added
>> the  "qemu_file_ops_are(const QEMUFile *, const QEMUFileOps *) "
>> that you recommended......
> If I understand correctly, the problem is that save_rdma_page is
> asynchronous and you have to wait for pending operations to do the
> put_buffer protocol correctly.
>
> Would it work to just do the "drain" in the put_buffer operation, if and
> only if it was preceded by a save_rdma_page operation?

Yes, the drain needs to happen in a few places already:

1. During save_rdma_page (if the current "chunk" is full of pages)
2. During the end of each iteration (now using qemu_fflush in my current 
patch)
3. And also during qemu_savevm_state_complete(), also using qemu_fflush.
>>>>    /** Flushes QEMUFile buffer
>>>>     *
>>>>     */
>>>> @@ -723,6 +867,8 @@ int qemu_get_byte(QEMUFile *f)
>>>>    int64_t qemu_ftell(QEMUFile *f)
>>>>    {
>>>>        qemu_fflush(f);
>>>> +    if(migrate_use_rdma(f))
>>>> +    return delta_norm_mig_bytes_transferred();
>>> Not needed, and another undesirable dependency (savevm.c ->
>>> arch_init.c).  Just update f->pos in save_rdma_page.
>> f->pos isn't good enough because save_rdma_page does not
>> go through QEMUFile directly - only non-live state goes
>> through QEMUFile ....... pc.ram uses direct RDMA writes.
>>
>> As a result, the position pointer does not get updated
>> and the accounting is missed........
> Yes, I am suggesting to modify f->pos in save_rdma_page instead.
>
> Paolo
>

Would that not confuse the other QEMUFile users?
If I change that pointer (without actually putting bytes
into QEMUFile), won't the f->pos pointer be
incorrectly updated?
Paolo Bonzini - March 19, 2013, 1:25 p.m.
Il 19/03/2013 14:12, Michael R. Hines ha scritto:
> On 03/19/2013 05:18 AM, Paolo Bonzini wrote:
>> Il 18/03/2013 21:33, Michael R. Hines ha scritto:
>>>> +int qemu_drain(QEMUFile *f)
>>>> +{
>>>> +    return f->ops->drain ? f->ops->drain(f->opaque) : 0;
>>>> +}
>>>> Hmm, this is very similar to qemu_fflush, but not quite. :/
>>>>
>>>> Why exactly is this needed?
>>> Good idea - I'll replace drain with flush once I added
>>> the  "qemu_file_ops_are(const QEMUFile *, const QEMUFileOps *) "
>>> that you recommended......
>> If I understand correctly, the problem is that save_rdma_page is
>> asynchronous and you have to wait for pending operations to do the
>> put_buffer protocol correctly.
>>
>> Would it work to just do the "drain" in the put_buffer operation, if and
>> only if it was preceded by a save_rdma_page operation?
> 
> Yes, the drain needs to happen in a few places already:
> 
> 1. During save_rdma_page (if the current "chunk" is full of pages)

Ok, this is internal to RDMA so no problem.

> 2. During the end of each iteration (now using qemu_fflush in my current
> patch)

Why?

> 3. And also during qemu_savem_state_complete(), also using qemu_fflush.

This would be caught by put_buffer, but (2) would not.

>>>>>    /** Flushes QEMUFile buffer
>>>>>     *
>>>>>     */
>>>>> @@ -723,6 +867,8 @@ int qemu_get_byte(QEMUFile *f)
>>>>>    int64_t qemu_ftell(QEMUFile *f)
>>>>>    {
>>>>>        qemu_fflush(f);
>>>>> +    if(migrate_use_rdma(f))
>>>>> +    return delta_norm_mig_bytes_transferred();
>>>> Not needed, and another undesirable dependency (savevm.c ->
>>>> arch_init.c).  Just update f->pos in save_rdma_page.
>>> f->pos isn't good enough because save_rdma_page does not
>>> go through QEMUFile directly - only non-live state goes
>>> through QEMUFile ....... pc.ram uses direct RDMA writes.
>>>
>>> As a result, the position pointer does not get updated
>>> and the accounting is missed........
>> Yes, I am suggesting to modify f->pos in save_rdma_page instead.
>>
>> Paolo
>>
> 
> Would that not confuse the other QEMUFile users?
> If I change that pointer (without actually putting bytes
> in into QEMUFile), won't the f->pos pointer be
> incorrectly updated?

f->pos is never used directly by QEMUFile, it is almost an opaque value.
 It is accumulated on every qemu_fflush (so that it can be passed to the
->put_buffer function), and returned by qemu_ftell; nothing else.

If you make somehow save_rdma_page a new op, returning a value from that
op and adding it to f->pos would be a good way to achieve this.

Paolo
mrhines@linux.vnet.ibm.com - March 19, 2013, 1:40 p.m.
On 03/19/2013 09:25 AM, Paolo Bonzini wrote:
> Yes, the drain needs to happen in a few places already:
>
> 1. During save_rdma_page (if the current "chunk" is full of pages)
> Ok, this is internal to RDMA so no problem.
>
>> 2. During the end of each iteration (now using qemu_fflush in my current
>> patch)
> Why?

This is because of downtime: You have to drain the queue anyway at the
very end, and if you don't drain it in advance after each iteration, then
the queue will have lots of bytes in it waiting for transmission and the
Virtual Machine will be stopped for a much longer period of time during
the last iteration waiting for RDMA card to finish transmission of all those
bytes.

If you wait till the last iteration to do this, then all of that waiting 
time gets
counted as downtime, causing the VCPUs to be unnecessarily stopped.


>> 3. And also during qemu_savem_state_complete(), also using qemu_fflush.
> This would be caught by put_buffer, but (2) would not.
>

I'm not sure this is good enough either - we don't want to flush
the queue *frequently*..... only when it's necessary for performance
.... we do want the queue to have some meat to it so the hardware
can write bytes as fast as possible.....

If we flush inside put_buffer (which is called very frequently): then
we have no way to distinguish *where* put_buffer was called from
(either from qemu_savevm_state_complete() or from a device-level
function call that's using QEMUFile).

>>> Yes, I am suggesting to modify f->pos in save_rdma_page instead.
>>>
>>> Paolo
>>>
>> Would that not confuse the other QEMUFile users?
>> If I change that pointer (without actually putting bytes
>> in into QEMUFile), won't the f->pos pointer be
>> incorrectly updated?
> f->pos is never used directly by QEMUFile, it is almost an opaque value.
>   It is accumulated on every qemu_fflush (so that it can be passed to the
> ->put_buffer function), and returned by qemu_ftell; nothing else.
>
> If you make somehow save_rdma_page a new op, returning a value from that
> op and adding it to f->pos would be a good way to achieve this.

Ok, great - I'll take advantage of that........Thanks.
Paolo Bonzini - March 19, 2013, 1:45 p.m.
Il 19/03/2013 14:40, Michael R. Hines ha scritto:
> On 03/19/2013 09:25 AM, Paolo Bonzini wrote:
>> Yes, the drain needs to happen in a few places already:
>>
>> 1. During save_rdma_page (if the current "chunk" is full of pages)
>> Ok, this is internal to RDMA so no problem.
>>
>>> 2. During the end of each iteration (now using qemu_fflush in my current
>>> patch)
>> Why?
> 
> This is because of downtime: You have to drain the queue anyway at the
> very end, and if you don't drain it in advance after each iteration, then
> the queue will have lots of bytes in it waiting for transmission and the
> Virtual Machine will be stopped for a much longer period of time during
> the last iteration waiting for RDMA card to finish transmission of all
> those
> bytes.

Shouldn't the "current chunk full" case take care of it too?

Of course if you disable chunking you have to add a different condition,
perhaps directly into save_rdma_page.

> If you wait till the last iteration to do this, then all of that waiting time gets
> counted as downtime, causing the VCPUs to be unnecessarily stopped.
> 
>>> 3. And also during qemu_savem_state_complete(), also using qemu_fflush.
>> This would be caught by put_buffer, but (2) would not.
>>
> 
> I'm not sure this is good enough either - we don't want to flush
> the queue *frequently*..... only when it's necessary for performance
> .... we do want the queue to have some meat to it so the hardware
> can write bytes as fast as possible.....
> 
> If we flush inside put_buffer (which is called very frequently):

Is it called at any time during RAM migration?

> then we have no way to distinquish *where* put buffer was called from
> (either from qemu_savevm_state_complete() or from a device-level
> function call that's using QEMUFile).

Can you make drain a no-op if there is nothing in flight?  Then every
call to put_buffer after the first should not have any overhead.

Paolo

>>>> Yes, I am suggesting to modify f->pos in save_rdma_page instead.
>>>>
>>>> Paolo
>>>>
>>> Would that not confuse the other QEMUFile users?
>>> If I change that pointer (without actually putting bytes
>>> in into QEMUFile), won't the f->pos pointer be
>>> incorrectly updated?
>> f->pos is never used directly by QEMUFile, it is almost an opaque value.
>>   It is accumulated on every qemu_fflush (so that it can be passed to the
>> ->put_buffer function), and returned by qemu_ftell; nothing else.
>>
>> If you make somehow save_rdma_page a new op, returning a value from that
>> op and adding it to f->pos would be a good way to achieve this.
> 
> Ok, great - I'll take advantage of that........Thanks.
>
mrhines@linux.vnet.ibm.com - March 19, 2013, 2:10 p.m.
On 03/19/2013 09:45 AM, Paolo Bonzini wrote:
> This is because of downtime: You have to drain the queue anyway at the
> very end, and if you don't drain it in advance after each iteration, then
> the queue will have lots of bytes in it waiting for transmission and the
> Virtual Machine will be stopped for a much longer period of time during
> the last iteration waiting for RDMA card to finish transmission of all
> those
> bytes.
> Shouldn't the "current chunk full" case take care of it too?
>
> Of course if you disable chunking you have to add a different condition,
> perhaps directly into save_rdma_page.

No, we don't want to flush on "chunk full" - that has a different meaning.
We want to have as many chunks submitted to the hardware for transmission
as possible to keep the bytes moving.


>>>> 3. And also during qemu_savem_state_complete(), also using qemu_fflush.
>>> This would be caught by put_buffer, but (2) would not.
>>>
>> I'm not sure this is good enough either - we don't want to flush
>> the queue *frequently*..... only when it's necessary for performance
>> .... we do want the queue to have some meat to it so the hardware
>> can write bytes as fast as possible.....
>>
>> If we flush inside put_buffer (which is called very frequently):
> Is it called at any time during RAM migration?

I don't understand the question: the flushing we've been discussing
is *only* for RAM migration - not for the non-live state.

I haven't introduced any "new" flushes for non-live state other than
when it's absolutely necessary to flush for RAM migration.


>> then we have no way to distinquish *where* put buffer was called from
>> (either from qemu_savevm_state_complete() or from a device-level
>> function call that's using QEMUFile).
> Can you make drain a no-op if there is nothing in flight?  Then every
> call to put_buffer after the first should not have any overhead.
>
> Paolo

That still doesn't solve the problem: If there is nothing in flight,
then there is no reason to call qemu_fflush() in the first place.

This is why I avoided using fflush() in the beginning, because it
sort of "confuses" who is using it: from the perspective of fflush(),
you can't tell if the user calling it for RAM or for non-live state.

The flushes we need are only for RAM, not the rest of it......

Make sense?
Paolo Bonzini - March 19, 2013, 2:22 p.m.
Il 19/03/2013 15:10, Michael R. Hines ha scritto:
> On 03/19/2013 09:45 AM, Paolo Bonzini wrote:
>> This is because of downtime: You have to drain the queue anyway at the
>> very end, and if you don't drain it in advance after each iteration, then
>> the queue will have lots of bytes in it waiting for transmission and the
>> Virtual Machine will be stopped for a much longer period of time during
>> the last iteration waiting for RDMA card to finish transmission of all
>> those
>> bytes.
>> Shouldn't the "current chunk full" case take care of it too?
>>
>> Of course if you disable chunking you have to add a different condition,
>> perhaps directly into save_rdma_page.
> 
> No, we don't want to flush on "chunk full" - that has a different meaning.
> We want to have as many chunks submitted to the hardware for transmission
> as possible to keep the bytes moving.

That however gives me an idea...  Instead of the full drain at the end
of an iteration, does it make sense to do a "partial" drain at every
chunk full, so that you don't have > N bytes pending and the downtime is
correspondingly limited?

>>>>> 3. And also during qemu_savem_state_complete(), also using
>>>>> qemu_fflush.
>>>> This would be caught by put_buffer, but (2) would not.
>>>>
>>> I'm not sure this is good enough either - we don't want to flush
>>> the queue *frequently*..... only when it's necessary for performance
>>> .... we do want the queue to have some meat to it so the hardware
>>> can write bytes as fast as possible.....
>>>
>>> If we flush inside put_buffer (which is called very frequently):
>> Is it called at any time during RAM migration?
> 
> I don't understand the question: the flushing we've been discussing
> is *only* for RAM migration - not for the non-live state.

Yes.  But I would like to piggyback the final, full drain on the switch
from RAM migration to device migration.

>> Can you make drain a no-op if there is nothing in flight?  Then every
>> call to put_buffer after the first should not have any overhead.
> 
> That still doesn't solve the problem: If there is nothing in flight,
> then there is no reason to call qemu_fflush() in the first place.

If there is no RAM migration in flight.  So you have

   migrate RAM
   ...
   RAM migration finished, device migration start
   put_buffer <<<<< QEMUFileRDMA triggers drain
   put_buffer
   put_buffer
   put_buffer
   ...

> The flushes we need are only for RAM, not the rest of it......
> 
> Make sense?

Paolo
mrhines@linux.vnet.ibm.com - March 19, 2013, 3:02 p.m.
Consider the following sequence:

1. Boot fresh VM (say, a boring 1GB vm) => Resident set is small, say 100M
2. Touch all the memory (with a utility or something) => Resident set is ~1G
3. Send QMP "balloon 500" => Resident set is ~500M
4. Now, migrate the VM => Resident set is 1G again

This suggests to me that migration is not accounting for
what memory was ballooned.

I suspect this is because the migration_bitmap does not coordinate
with the list of ballooned-out memory that was MADVISED().

This affects RDMA as well as TCP on the sender side.

Is there any hard reason why we're not validating migration_bitmap against
the memory that was MADVISED()'d?

- Michael R. Hines
mrhines@linux.vnet.ibm.com - March 19, 2013, 3:12 p.m.
Actually, you don't even need ballooning to reproduce this behavior.

Is this a known issue?

- Michael


On 03/19/2013 11:02 AM, Michael R. Hines wrote:
> Consider the following sequence:
>
> 1. Boot fresh VM (say, a boring 1GB vm)                    => Resident 
> set is small, say 100M
> 2. Touch all the memory (with a utility or something) => Resident set 
> is ~1G
> 3. Send QMP "balloon 500" => Resident set is ~500M
> 4. Now, migrate the VM => Resident set is 1G again
>
> This suggests to me that migration is not accounting for
> what memory was ballooned.
>
> I suspect this is because the migration_bitmap does not coordinate
> with the list of ballooned-out memory that was MADVISED().
>
> This affects RDMA as well as TCP on the sender side.
>
> Is there any hard reason why we're not validating migration_bitmap 
> against
> the memory that was MADVISED()'d?
>
> - Michael R. Hines
>
>
Michael S. Tsirkin - March 19, 2013, 3:17 p.m.
On Tue, Mar 19, 2013 at 11:12:50AM -0400, Michael R. Hines wrote:
> Actually, you don't even need ballooning to reproduce this behavior.
> 
> Is this a known issue?
> 
> - Michael

Yes.
mrhines@linux.vnet.ibm.com - March 19, 2013, 6:27 p.m.
On 03/19/2013 10:22 AM, Paolo Bonzini wrote:
> Il 19/03/2013 15:10, Michael R. Hines ha scritto:
>> On 03/19/2013 09:45 AM, Paolo Bonzini wrote:
>>> This is because of downtime: You have to drain the queue anyway at the
>>> very end, and if you don't drain it in advance after each iteration, then
>>> the queue will have lots of bytes in it waiting for transmission and the
>>> Virtual Machine will be stopped for a much longer period of time during
>>> the last iteration waiting for RDMA card to finish transmission of all
>>> those
>>> bytes.
>>> Shouldn't the "current chunk full" case take care of it too?
>>>
>>> Of course if you disable chunking you have to add a different condition,
>>> perhaps directly into save_rdma_page.
>> No, we don't want to flush on "chunk full" - that has a different meaning.
>> We want to have as many chunks submitted to the hardware for transmission
>> as possible to keep the bytes moving.
> That however gives me an idea...  Instead of the full drain at the end
> of an iteration, does it make sense to do a "partial" drain at every
> chunk full, so that you don't have > N bytes pending and the downtime is
> correspondingly limited?


Sure, you could do that, but it seems overly complex just to avoid
a single flush() call at the end of each iteration, right?

> If there is no RAM migration in flight.  So you have
>
>     migrate RAM
>     ...
>     RAM migration finished, device migration start
>     put_buffer <<<<< QEMUFileRDMA triggers drain
>     put_buffer
>     put_buffer
>     put_buffer
>     ...

Ah, yes, ok. Very simple modification......
Paolo Bonzini - March 19, 2013, 6:40 p.m.
Il 19/03/2013 19:27, Michael R. Hines ha scritto:
>>>
>> That however gives me an idea...  Instead of the full drain at the end
>> of an iteration, does it make sense to do a "partial" drain at every
>> chunk full, so that you don't have > N bytes pending and the downtime is
>> correspondingly limited?
> 
> 
> Sure, you could do that, but it seems overly complex just to avoid
> a single flush() call at the end of each iteration, right?

Would it really be that complex?  Not having an extra QEMUFile op
perhaps balances that complexity (and the complexity remains hidden in
rdma.c, which is an advantage).

You could alternatively drain every N megabytes sent, or something like
that.  But a partial drain would help obeying the maximum downtime
limitations.

Paolo
Paolo Bonzini - March 20, 2013, 3:20 p.m.
Il 19/03/2013 19:40, Paolo Bonzini ha scritto:
>>> >> That however gives me an idea...  Instead of the full drain at the end
>>> >> of an iteration, does it make sense to do a "partial" drain at every
>>> >> chunk full, so that you don't have > N bytes pending and the downtime is
>>> >> correspondingly limited?
>> > 
>> > 
>> > Sure, you could do that, but it seems overly complex just to avoid
>> > a single flush() call at the end of each iteration, right?
> Would it really be that complex?  Not having an extra QEMUFile op
> perhaps balances that complexity (and the complexity remains hidden in
> rdma.c, which is an advantage).
> 
> You could alternatively drain every N megabytes sent, or something like
> that.  But a partial drain would help obeying the maximum downtime
> limitations.

On second thought: just keep the drain operation, but make it clear that
it is related to the new save_ram_page QEMUFileOps field.  You could
call it flush_ram_pages or something like that.

Paolo
mrhines@linux.vnet.ibm.com - March 20, 2013, 4:09 p.m.
On 03/20/2013 11:20 AM, Paolo Bonzini wrote:
> Il 19/03/2013 19:40, Paolo Bonzini ha scritto:
>>>>>> That however gives me an idea...  Instead of the full drain at the end
>>>>>> of an iteration, does it make sense to do a "partial" drain at every
>>>>>> chunk full, so that you don't have > N bytes pending and the downtime is
>>>>>> correspondingly limited?
>>>>
>>>> Sure, you could do that, but it seems overly complex just to avoid
>>>> a single flush() call at the end of each iteration, right?
>> Would it really be that complex?  Not having an extra QEMUFile op
>> perhaps balances that complexity (and the complexity remains hidden in
>> rdma.c, which is an advantage).
>>
>> You could alternatively drain every N megabytes sent, or something like
>> that.  But a partial drain would help obeying the maximum downtime
>> limitations.
> On second thought: just keep the drain operation, but make it clear that
> it is related to the new save_ram_page QEMUFileOps field.  You could
> call it flush_ram_pages or something like that.
>
> Paolo
>

Acknowledged. This helps a lot, thank you. I'll be sure to
clearly conditionalize everything in the next RFC.

Patch

diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index df81261..9046751 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -51,23 +51,33 @@  typedef int (QEMUFileCloseFunc)(void *opaque);
  */
 typedef int (QEMUFileGetFD)(void *opaque);
 
+/*
+ * 'drain' from a QEMUFile perspective means
+ * to flush the outbound send buffer
+ * (if one exists). (Only used by RDMA right now.)
+ * Returns 0 on success, negative on failure.
+ */
+typedef int (QEMUFileDrainFunc)(void *opaque);
+
 typedef struct QEMUFileOps {
     QEMUFilePutBufferFunc *put_buffer;
     QEMUFileGetBufferFunc *get_buffer;
     QEMUFileCloseFunc *close;
     QEMUFileGetFD *get_fd;
+    QEMUFileDrainFunc *drain; /* optional; NULL means nothing to drain */
 } QEMUFileOps;
 
 QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops);
 QEMUFile *qemu_fopen(const char *filename, const char *mode);
 QEMUFile *qemu_fdopen(int fd, const char *mode);
 QEMUFile *qemu_fopen_socket(int fd, const char *mode);
+QEMUFile *qemu_fopen_rdma(void *opaque, const char *mode);
 QEMUFile *qemu_popen_cmd(const char *command, const char *mode);
 int qemu_get_fd(QEMUFile *f);
 int qemu_fclose(QEMUFile *f);
 int64_t qemu_ftell(QEMUFile *f);
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, int size);
 void qemu_put_byte(QEMUFile *f, int v);
+int qemu_drain(QEMUFile *f);
 
 static inline void qemu_put_ubyte(QEMUFile *f, unsigned int v)
 {
diff --git a/savevm.c b/savevm.c
index 35c8d1e..9b90b7f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -32,6 +32,7 @@ 
 #include "qemu/timer.h"
 #include "audio/audio.h"
 #include "migration/migration.h"
+#include "migration/rdma.h"
 #include "qemu/sockets.h"
 #include "qemu/queue.h"
 #include "sysemu/cpus.h"
@@ -143,6 +144,13 @@  typedef struct QEMUFileSocket
     QEMUFile *file;
 } QEMUFileSocket;
 
+/*
+ * Per-connection state for a QEMUFile backed by an RDMA transport
+ * instead of a byte stream.
+ */
+typedef struct QEMUFileRDMA
+{
+    void *rdma;     /* opaque RDMA context handed to the qemu_rdma_* helpers */
+    size_t len;     /* byte count of the most recent send/fill operation */
+    QEMUFile *file; /* the QEMUFile wrapping this state */
+} QEMUFileRDMA;
+
 typedef struct {
     Coroutine *co;
     int fd;
@@ -178,6 +186,66 @@  static int socket_get_fd(void *opaque)
     return s->fd;
 }
 
+/*
+ * put_buffer hook: transmit non-live device state as RDMA SEND
+ * messages, chunked to at most RDMA_SEND_INCREMENT bytes each.
+ * pc.ram is handled elsewhere...
+ *
+ * Returns 'size' on success, a negative errno value on failure.
+ * 'pos' is ignored: an RDMA link is not a seekable byte stream.
+ */
+static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size)
+{
+    QEMUFileRDMA *r = opaque;
+    size_t remaining = size;
+    /* NOTE(review): casts away const through void* — legal C but ugly */
+    uint8_t * data = (void *) buf;
+
+    /*
+     * Although we're sending non-live
+     * state here, push out any writes that
+     * we've queued up for pc.ram anyway,
+     * so this SEND cannot overtake pending RDMA writes.
+     */
+    if (qemu_rdma_write_flush(r->rdma) < 0)
+        return -EIO;
+
+    /* Chunk the buffer; each iteration posts one SEND. */
+    while(remaining) {
+        r->len = MIN(remaining, RDMA_SEND_INCREMENT);
+        remaining -= r->len;
+
+        if(qemu_rdma_exchange_send(r->rdma, data, r->len) < 0)
+                return -EINVAL;
+
+        data += r->len;
+    }
+
+    return size;
+} 
+
+/*
+ * get_buffer hook: RDMA links don't use bytestreams, so we have to
+ * return bytes to QEMUFile opportunistically out of whole SEND
+ * messages as they arrive.
+ *
+ * Returns the number of bytes copied into 'buf' (possibly fewer
+ * than 'size'), or negative on failure. 'pos' is ignored.
+ */
+static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
+{
+    QEMUFileRDMA *r = opaque;
+
+    /*
+     * First, we hold on to the last SEND message we 
+     * were given and dish out the bytes until we run 
+     * out of bytes.
+     */
+    if((r->len = qemu_rdma_fill(r->rdma, buf, size)))
+	return r->len; 
+
+     /*
+      * Once we run out, we block and wait for another
+      * SEND message to arrive.
+      */
+    if(qemu_rdma_exchange_recv(r->rdma) < 0)
+	return -EINVAL;
+
+    /*
+     * SEND was received with new bytes, now try again.
+     */
+    return qemu_rdma_fill(r->rdma, buf, size);
+} 
+
 static int socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
 {
     QEMUFileSocket *s = opaque;
@@ -390,16 +458,24 @@  static const QEMUFileOps socket_write_ops = {
     .close =      socket_close
 };
 
-QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+/*
+ * Validate a qemu_fopen-style mode string: exactly "rb" or "wb".
+ * Factored out of qemu_fopen()/qemu_fopen_socket() so that
+ * qemu_fopen_rdma() can share the same check.
+ *
+ * Returns true (after printing to stderr) when the mode is invalid.
+ */
+static bool qemu_mode_is_not_valid(const char * mode)
 {
     if (mode == NULL ||
         (mode[0] != 'r' && mode[0] != 'w') ||
         mode[1] != 'b' || mode[2] != 0) {
         fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
-        return NULL;
+        return true;
     }
+    
+    return false;
+}
+
+QEMUFile *qemu_fopen_socket(int fd, const char *mode)
+{
+    QEMUFileSocket *s = g_malloc0(sizeof(QEMUFileSocket));
+
+    if(qemu_mode_is_not_valid(mode))
+	return NULL;
 
     s->fd = fd;
     if (mode[0] == 'w') {
@@ -411,16 +487,66 @@  QEMUFile *qemu_fopen_socket(int fd, const char *mode)
     return s->file;
 }
 
+/*
+ * close hook shared by the read and write ops tables: tear down the
+ * RDMA connection (when one exists) and free both the RDMA context
+ * and this wrapper. Always returns 0.
+ */
+static int qemu_rdma_close(void *opaque)
+{
+    QEMUFileRDMA *r = opaque;
+    if(r->rdma) {
+        qemu_rdma_cleanup(r->rdma);
+        g_free(r->rdma);
+    }
+    g_free(r);
+    return 0;
+}
+
+/*
+ * Return the RDMA context of 'f' when RDMA is enabled on it,
+ * NULL otherwise.
+ *
+ * NOTE(review): this blindly treats f->opaque as a QEMUFileRDMA.
+ * qemu_ftell() calls it for *every* QEMUFile, including socket- and
+ * stdio-backed ones whose opaque is a different struct — confirm
+ * qemu_rdma_enabled() copes with a non-RDMA opaque pointer.
+ */
+void * migrate_use_rdma(QEMUFile *f)
+{
+    QEMUFileRDMA *r = f->opaque;
+
+    return qemu_rdma_enabled(r->rdma) ? r->rdma : NULL;
+}
+
+/*
+ * drain hook: clear the cached message length and wait until
+ * completion-queue entries have been received for all previously
+ * posted RDMA writes.
+ */
+static int qemu_rdma_drain_completion(void *opaque)
+{
+    QEMUFileRDMA *r = opaque;
+    r->len = 0;
+    return qemu_rdma_drain_cq(r->rdma);
+}
+
+/* Incoming migration: bytes arrive as SEND messages. */
+static const QEMUFileOps rdma_read_ops = {
+    .get_buffer = qemu_rdma_get_buffer,
+    .close =      qemu_rdma_close,
+};
+
+/* Outgoing migration: only the writer needs the drain hook. */
+static const QEMUFileOps rdma_write_ops = {
+    .put_buffer = qemu_rdma_put_buffer,
+    .close =      qemu_rdma_close,
+    .drain =	  qemu_rdma_drain_completion,
+};
+
+/*
+ * Open a QEMUFile on top of an already-established RDMA connection.
+ *
+ * 'opaque' is the RDMA context; ownership passes to the returned
+ * file and is released by qemu_rdma_close(). 'mode' must be "wb"
+ * (outgoing migration) or "rb" (incoming). Returns NULL when the
+ * mode string is invalid.
+ *
+ * NOTE(review): the QEMUFileRDMA allocated here leaks on the
+ * invalid-mode path — g_malloc0() runs before the mode check.
+ */
+QEMUFile *qemu_fopen_rdma(void *opaque, const char * mode)
+{
+    QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA));
+
+    if(qemu_mode_is_not_valid(mode))
+	return NULL;
+
+    r->rdma = opaque;
+
+    if (mode[0] == 'w') {
+        r->file = qemu_fopen_ops(r, &rdma_write_ops);
+    } else {
+        r->file = qemu_fopen_ops(r, &rdma_read_ops);
+    }
+
+    return r->file;
+}
+
 QEMUFile *qemu_fopen(const char *filename, const char *mode)
 {
     QEMUFileStdio *s;
 
-    if (mode == NULL ||
-	(mode[0] != 'r' && mode[0] != 'w') ||
-	mode[1] != 'b' || mode[2] != 0) {
-        fprintf(stderr, "qemu_fopen: Argument validity check failed\n");
-        return NULL;
-    }
+    if(qemu_mode_is_not_valid(mode))
+	return NULL;
 
     s = g_malloc0(sizeof(QEMUFileStdio));
 
@@ -497,6 +623,24 @@  static void qemu_file_set_error(QEMUFile *f, int ret)
     }
 }
 
+/*
+ * Called only for RDMA right now, at the end
+ * of each live iteration of memory.
+ *
+ * 'drain' from a QEMUFile perspective means
+ * to flush the outbound send buffer
+ * (if one exists).
+ *
+ * For RDMA, this means to make sure we've
+ * received completion queue (CQ) messages
+ * successfully for all of the RDMA writes
+ * that we requested.
+ *
+ * Transports without a drain hook trivially succeed (returns 0).
+ */
+int qemu_drain(QEMUFile *f)
+{
+    return f->ops->drain ? f->ops->drain(f->opaque) : 0;
+}
+
 /** Flushes QEMUFile buffer
  *
  */
@@ -723,6 +867,8 @@  int qemu_get_byte(QEMUFile *f)
 int64_t qemu_ftell(QEMUFile *f)
 {
     qemu_fflush(f);
+    /*
+     * RDMA transfers bypass the QEMUFile buffer, so f->pos does not
+     * track bytes actually moved; report the migration byte counter
+     * instead.
+     * NOTE(review): migrate_use_rdma() dereferences f->opaque as a
+     * QEMUFileRDMA — verify this is safe for non-RDMA files, since
+     * qemu_ftell() runs for every transport.
+     */
+    if(migrate_use_rdma(f))
+	return delta_norm_mig_bytes_transferred();
     return f->pos;
 }
 
@@ -1737,6 +1883,12 @@  void qemu_savevm_state_complete(QEMUFile *f)
         }
     }
 
+    if ((ret = qemu_drain(f)) < 0) {
+	fprintf(stderr, "failed to drain RDMA first!\n");
+        qemu_file_set_error(f, ret);
+	return;
+    }
+
     QTAILQ_FOREACH(se, &savevm_handlers, entry) {
         int len;