diff mbox series

[RFC,v1,2/3] qemu-file: add buffered mode

Message ID 1586776334-641239-3-git-send-email-dplotnikov@virtuozzo.com
State New
Headers show
Series qemu-file writing performance improving | expand

Commit Message

Denis Plotnikov April 13, 2020, 11:12 a.m. UTC
The patch adds ability to qemu-file to write the data
asynchronously to improve the performance on writing.
Before, only synchronous writing was supported.

Enabling of the asyncronous mode is managed by new
"enabled_buffered" callback.

Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
---
 include/qemu/typedefs.h |   1 +
 migration/qemu-file.c   | 351 +++++++++++++++++++++++++++++++++++++++++++++---
 migration/qemu-file.h   |   9 ++
 3 files changed, 339 insertions(+), 22 deletions(-)

Comments

Eric Blake April 24, 2020, 9:25 p.m. UTC | #1
On 4/13/20 6:12 AM, Denis Plotnikov wrote:
> The patch adds ability to qemu-file to write the data
> asynchronously to improve the performance on writing.
> Before, only synchronous writing was supported.
> 
> Enabling of the asyncronous mode is managed by new

asynchronous

> "enabled_buffered" callback.

The term "enabled_buffered" does not appear in the patch.  Did you mean...

> 
> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
> ---
>   include/qemu/typedefs.h |   1 +
>   migration/qemu-file.c   | 351 +++++++++++++++++++++++++++++++++++++++++++++---
>   migration/qemu-file.h   |   9 ++
>   3 files changed, 339 insertions(+), 22 deletions(-)
> 

> @@ -60,6 +66,22 @@ struct QEMUFile {
>       bool shutdown;
>       /* currently used buffer */
>       QEMUFileBuffer *current_buf;
> +    /*
> +     * with buffered_mode enabled all the data copied to 512 byte
> +     * aligned buffer, including iov data. Then the buffer is passed
> +     * to writev_buffer callback.
> +     */
> +    bool buffered_mode;

..."Asynchronous mode is managed by setting the new buffered_mode flag"? 
  ...


> +    /* for async buffer writing */
> +    AioTaskPool *pool;
> +    /* the list of free buffers, currently used on is NOT there */

s/on/one/

> +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
> +};
> +
> +struct QEMUFileAioTask {
> +    AioTask task;
> +    QEMUFile *f;
> +    QEMUFileBuffer *fb;
>   };
>   
>   /*
> @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
>       f->opaque = opaque;
>       f->ops = ops;
>   
> -    f->current_buf = g_new0(QEMUFileBuffer, 1);
> -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
> -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
> -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
> +    if (f->ops->enable_buffered) {
> +        f->buffered_mode = f->ops->enable_buffered(f->opaque);

...ah, you meant 'enable_buffered'.  But still, why do we need a 
callback function?  Is it not sufficient to just have a bool flag?


> +static size_t get_buf_free_size(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    /* buf_index can't be greated than buf_size */

greater

> +    assert(fb->buf_size >= fb->buf_index);
> +    return fb->buf_size - fb->buf_index;
> +}
> +

> +static int write_task_fn(AioTask *task)
> +{

> +    /*
> +     * Increment file position.
> +     * This needs to be here before calling writev_buffer, because
> +     * writev_buffer is asynchronous and there could be more than one
> +     * writev_buffer started simultaniously. Each writev_buffer should

simultaneously

> +     * use its own file pos to write to. writev_buffer may write less
> +     * than buf_index bytes but we treat this situation as an error.
> +     * If error appeared, further file using is meaningless.

s/using/use/

> +     * We expect that, the most of the time the full buffer is written,
> +     * (when buf_size == buf_index). The only case when the non-full
> +     * buffer is written (buf_size != buf_index) is file close,
> +     * when we need to flush the rest of the buffer content.

We expect that most of the time, the full buffer will be written 
(buf_size == buf_index), with the exception at file close where we need 
to flush the final partial buffer.

> +     */
> +    f->pos += fb->buf_index;
> +
> +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
> +
> +    /* return the just written buffer to the free list */
> +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
> +
> +    /* check that we have written everything */
> +    if (ret != fb->buf_index) {
> +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
> +    }
> +
> +    /*
> +     * always return 0 - don't use task error handling, relay on

rely

> +     * qemu file error handling
> +     */
> +    return 0;
> +}
> +
> +static void qemu_file_switch_current_buf(QEMUFile *f)
> +{
> +    /*
> +     * if the list is empty, wait until some task returns a buffer
> +     * to the list of free buffers.
> +     */
> +    if (QLIST_EMPTY(&f->free_buffers)) {
> +        aio_task_pool_wait_slot(f->pool);
> +    }
> +
> +    /*
> +     * sanity check that the list isn't empty
> +     * if the free list was empty, we waited for a task complition,

completion

> +     * and the pompleted task must return a buffer to a list of free buffers

completed

> +     */
> +    assert(!QLIST_EMPTY(&f->free_buffers));
> +
> +    /* set the current buffer for using from the free list */
> +    f->current_buf = QLIST_FIRST(&f->free_buffers);
> +    reset_buf(f);
> +
> +    QLIST_REMOVE(f->current_buf, link);
> +}
> +

>   
>   /*
> + * Copy an external buffer to the intenal current buffer.

internal

> + */
> +static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
> +                     bool may_free)
> +{

> +++ b/migration/qemu-file.h
> @@ -103,6 +103,14 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
>   typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
>                                      Error **errp);
>   
> +/*
> + * Enables or disables the buffered mode
> + * Existing blocking reads/writes must be woken
> + * Returns true if the buffered mode has to be enabled,
> + * false if it has to be disabled.
> + */
> +typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);

If this never gets called outside of initial creation of the QemuFile 
(that is, it is not dynamic), then making it a straight flag instead of 
a callback function is simpler.

> +
>   typedef struct QEMUFileOps {
>       QEMUFileGetBufferFunc *get_buffer;
>       QEMUFileCloseFunc *close;
> @@ -110,6 +118,7 @@ typedef struct QEMUFileOps {
>       QEMUFileWritevBufferFunc *writev_buffer;
>       QEMURetPathFunc *get_return_path;
>       QEMUFileShutdownFunc *shut_down;
> +    QEMUFileEnableBufferedFunc *enable_buffered;
>   } QEMUFileOps;
>   
>   typedef struct QEMUFileHooks {
>
Vladimir Sementsov-Ogievskiy April 25, 2020, 9:10 a.m. UTC | #2
13.04.2020 14:12, Denis Plotnikov wrote:
> The patch adds ability to qemu-file to write the data
> asynchronously to improve the performance on writing.
> Before, only synchronous writing was supported.
> 
> Enabling of the asyncronous mode is managed by new
> "enabled_buffered" callback.

Hmm.

I don't like resulting architecture very much:

1. Function naming is not clean: how can I understand that copy_buf is for buffered mode when add_to_iovec is +- same thing for non-buffered mode?

Hmm actually, you just alter several significant functions of QEMUFile - open, close, put, flush. In old mode we do one thing, in a new mode - absolutely another. This looks like a driver. So may be we want to add QEMUFileDriver struct, to define these functions as callbacks, move old realizations to default driver, and add new functionality as a new driver, what do you think?

2. Terminology: you say you add buffered mode, but actually qemu file is already work in buffered mode, so it should be clarified somehow..
You also add asynchronisity, but old implementation has already qemu_put_buffer_async..
You use aio task pool, but don't say that it may be used only from coroutine.
May be, we'd better call it "coroutine-mode" ?


Also, am I correct that new mode can't be used for read? Should we document/assert it somehow? Or may be support reading? Will switch to snapshot benefit if we implement reading in a new mode?

> 
> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
> ---
>   include/qemu/typedefs.h |   1 +
>   migration/qemu-file.c   | 351 +++++++++++++++++++++++++++++++++++++++++++++---
>   migration/qemu-file.h   |   9 ++
>   3 files changed, 339 insertions(+), 22 deletions(-)
> 
> diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
> index 88dce54..9b388c8 100644
> --- a/include/qemu/typedefs.h
> +++ b/include/qemu/typedefs.h
> @@ -98,6 +98,7 @@ typedef struct QEMUBH QEMUBH;
>   typedef struct QemuConsole QemuConsole;
>   typedef struct QEMUFile QEMUFile;
>   typedef struct QEMUFileBuffer QEMUFileBuffer;
> +typedef struct QEMUFileAioTask QEMUFileAioTask;
>   typedef struct QemuLockable QemuLockable;
>   typedef struct QemuMutex QemuMutex;
>   typedef struct QemuOpt QemuOpt;
> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> index 285c6ef..f42f949 100644
> --- a/migration/qemu-file.c
> +++ b/migration/qemu-file.c
> @@ -29,19 +29,25 @@
>   #include "qemu-file.h"
>   #include "trace.h"
>   #include "qapi/error.h"
> +#include "block/aio_task.h"
>   
> -#define IO_BUF_SIZE 32768
> +#define IO_BUF_SIZE (1024 * 1024)
>   #define MAX_IOV_SIZE MIN(IOV_MAX, 64)
> +#define IO_BUF_NUM 2

Interesting, how much is it better than if we set to 1, limiting the influence of the series to alignment of written chunks?

> +#define IO_BUF_ALIGNMENT 512
>   
> -QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, 512));
> +QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, IO_BUF_ALIGNMENT));
> +QEMU_BUILD_BUG_ON(IO_BUF_SIZE > INT_MAX);
> +QEMU_BUILD_BUG_ON(IO_BUF_NUM <= 0);
>   
>   struct QEMUFileBuffer {
>       int buf_index;
> -    int buf_size; /* 0 when writing */
> +    int buf_size; /* 0 when non-buffered writing */
>       uint8_t *buf;
>       unsigned long *may_free;
>       struct iovec *iov;
>       unsigned int iovcnt;
> +    QLIST_ENTRY(QEMUFileBuffer) link;
>   };
>   
>   struct QEMUFile {
> @@ -60,6 +66,22 @@ struct QEMUFile {
>       bool shutdown;
>       /* currently used buffer */
>       QEMUFileBuffer *current_buf;
> +    /*
> +     * with buffered_mode enabled all the data copied to 512 byte
> +     * aligned buffer, including iov data. Then the buffer is passed
> +     * to writev_buffer callback.
> +     */
> +    bool buffered_mode;
> +    /* for async buffer writing */
> +    AioTaskPool *pool;
> +    /* the list of free buffers, currently used on is NOT there */
> +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
> +};
> +
> +struct QEMUFileAioTask {
> +    AioTask task;
> +    QEMUFile *f;
> +    QEMUFileBuffer *fb;
>   };
>   
>   /*
> @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
>       f->opaque = opaque;
>       f->ops = ops;
>   
> -    f->current_buf = g_new0(QEMUFileBuffer, 1);
> -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
> -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
> -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
> +    if (f->ops->enable_buffered) {
> +        f->buffered_mode = f->ops->enable_buffered(f->opaque);
> +    }
> +
> +    if (f->buffered_mode && qemu_file_is_writable(f)) {

So we actually go to buffered_mode if file is writable. Then, shouldn't we otherwise set buffered_mode to false otherwise?

> +        int i;
> +        /*
> +         * in buffered_mode we don't use internal io vectors
> +         * and may_free bitmap, because we copy the data to be
> +         * written right away to the buffer
> +         */
> +        f->pool = aio_task_pool_new(IO_BUF_NUM);
> +
> +        /* allocate io buffers */
> +        for (i = 0; i < IO_BUF_NUM; i++) {
> +            QEMUFileBuffer *fb = g_new0(QEMUFileBuffer, 1);
> +
> +            fb->buf = qemu_memalign(IO_BUF_ALIGNMENT, IO_BUF_SIZE);
> +            fb->buf_size = IO_BUF_SIZE;
> +
> +            /*
> +             * put the first buffer to the current buf and the rest
> +             * to the list of free buffers
> +             */
> +            if (i == 0) {
> +                f->current_buf = fb;
> +            } else {
> +                QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
> +            }
> +        }
> +    } else {
> +        f->current_buf = g_new0(QEMUFileBuffer, 1);
> +        f->current_buf->buf = g_malloc(IO_BUF_SIZE);
> +        f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
> +        f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
> +    }
>   
>       return f;
>   }
> @@ -190,6 +244,8 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>       unsigned long idx;
>       QEMUFileBuffer *fb = f->current_buf;
>   
> +    assert(!f->buffered_mode);
> +
>       /* Find and release all the contiguous memory ranges marked as may_free. */
>       idx = find_next_bit(fb->may_free, fb->iovcnt, 0);
>       if (idx >= fb->iovcnt) {
> @@ -221,6 +277,147 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>       bitmap_zero(fb->may_free, MAX_IOV_SIZE);
>   }
>   
> +static void advance_buf_ptr(QEMUFile *f, size_t size)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    /* must not advance to 0 */
> +    assert(size);
> +    /* must not overflow buf_index (int) */
> +    assert(fb->buf_index + size <= INT_MAX);

to not overflow in check: assert(fb->buf_index <= INT_MAX - size)

> +    /* must not exceed buf_size */
> +    assert(fb->buf_index + size <= fb->buf_size);
> +
> +    fb->buf_index += size;
> +}
> +
> +static size_t get_buf_free_size(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    /* buf_index can't be greated than buf_size */
> +    assert(fb->buf_size >= fb->buf_index);
> +    return fb->buf_size - fb->buf_index;
> +}
> +
> +static size_t get_buf_used_size(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    return fb->buf_index;
> +}
> +
> +static uint8_t *get_buf_ptr(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    /* protects from out of bound reading */
> +    assert(fb->buf_index <= IO_BUF_SIZE);
> +    return fb->buf + fb->buf_index;
> +}
> +
> +static bool buf_is_full(QEMUFile *f)
> +{
> +    return get_buf_free_size(f) == 0;
> +}
> +
> +static void reset_buf(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    fb->buf_index = 0;
> +}
> +
> +static int write_task_fn(AioTask *task)
> +{
> +    int ret;
> +    Error *local_error = NULL;
> +    QEMUFileAioTask *t = (QEMUFileAioTask *) task;
> +    QEMUFile *f = t->f;
> +    QEMUFileBuffer *fb = t->fb;
> +    uint64_t pos = f->pos;
> +    struct iovec v = (struct iovec) {
> +        .iov_base = fb->buf,
> +        .iov_len = fb->buf_index,
> +    };
> +
> +    assert(f->buffered_mode);
> +
> +    /*
> +     * Increment file position.
> +     * This needs to be here before calling writev_buffer, because
> +     * writev_buffer is asynchronous and there could be more than one
> +     * writev_buffer started simultaniously. Each writev_buffer should
> +     * use its own file pos to write to. writev_buffer may write less
> +     * than buf_index bytes but we treat this situation as an error.
> +     * If error appeared, further file using is meaningless.
> +     * We expect that, the most of the time the full buffer is written,
> +     * (when buf_size == buf_index). The only case when the non-full
> +     * buffer is written (buf_size != buf_index) is file close,
> +     * when we need to flush the rest of the buffer content.
> +     */
> +    f->pos += fb->buf_index;

Seems safer to add pos to QEMUFileAioTask instead, and manage global f->pos
in main coroutine, not in tasks.

> +
> +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
> +
> +    /* return the just written buffer to the free list */
> +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
> +
> +    /* check that we have written everything */
> +    if (ret != fb->buf_index) {
> +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
> +    }
> +
> +    /*
> +     * always return 0 - don't use task error handling, relay on
> +     * qemu file error handling
> +     */
> +    return 0;
> +}
> +
> +static void qemu_file_switch_current_buf(QEMUFile *f)
> +{
> +    /*
> +     * if the list is empty, wait until some task returns a buffer
> +     * to the list of free buffers.
> +     */
> +    if (QLIST_EMPTY(&f->free_buffers)) {
> +        aio_task_pool_wait_slot(f->pool);
> +    }
> +
> +    /*
> +     * sanity check that the list isn't empty
> +     * if the free list was empty, we waited for a task complition,
> +     * and the pompleted task must return a buffer to a list of free buffers
> +     */
> +    assert(!QLIST_EMPTY(&f->free_buffers));
> +
> +    /* set the current buffer for using from the free list */
> +    f->current_buf = QLIST_FIRST(&f->free_buffers);
> +    reset_buf(f);
> +
> +    QLIST_REMOVE(f->current_buf, link);
> +}
> +
> +/**
> + *  Asynchronously flushes QEMUFile buffer
> + *
> + * This will flush all pending data. If data was only partially flushed, it
> + * will set an error state. The function may return before the data actually
> + * written.
> + */
> +static void flush_buffer(QEMUFile *f)
> +{
> +    QEMUFileAioTask *t = g_new(QEMUFileAioTask, 1);
> +
> +    *t = (QEMUFileAioTask) {
> +        .task.func = &write_task_fn,
> +        .f = f,
> +        .fb = f->current_buf,
> +    };
> +
> +    /* aio_task_pool should free t for us */
> +    aio_task_pool_start_task(f->pool, (AioTask *) t);
> +
> +    /* if no errors this will switch the buffer */
> +    qemu_file_switch_current_buf(f);
> +}
> +
>   /**
>    * Flushes QEMUFile buffer
>    *
> @@ -241,7 +438,13 @@ void qemu_fflush(QEMUFile *f)
>       if (f->shutdown) {
>           return;
>       }
> +
> +    if (f->buffered_mode) {
> +        return;

I don't think it's correct. qemu_fflush is public interface of QEMUFile and it's assumed to write all in-flight data.

> +    }
> +
>       if (fb->iovcnt > 0) {
> +        /* this is non-buffered mode */
>           expect = iov_size(fb->iov, fb->iovcnt);
>           ret = f->ops->writev_buffer(f->opaque, fb->iov, fb->iovcnt, f->pos,
>                                       &local_error);
> @@ -378,6 +581,7 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>   
>   void qemu_update_position(QEMUFile *f, size_t size)
>   {
> +    assert(!f->buffered_mode);

Public interface. Why are you sure that it's not used in snapshot? It is used in migration/ram.c ...

>       f->pos += size;

And it shouldn't be the problem for the new mode, just ass pos parameter to QEMUFileAioTask,

>   }
>   
> @@ -392,7 +596,18 @@ void qemu_update_position(QEMUFile *f, size_t size)
>   int qemu_fclose(QEMUFile *f)
>   {
>       int ret;
> -    qemu_fflush(f);
> +
> +    if (qemu_file_is_writable(f) && f->buffered_mode) {
> +        ret = qemu_file_get_error(f);
> +        if (!ret) {
> +            flush_buffer(f);
> +        }
> +        /* wait until all tasks are done */
> +        aio_task_pool_wait_all(f->pool);
> +    } else {
> +        qemu_fflush(f);
> +    }

Again, not clean thing: for buffered mode we use flush_buffer, for non-buffered we use qemu_fflush.. It's not clean from function naming.

> +
>       ret = qemu_file_get_error(f);
>   
>       if (f->ops->close) {
> @@ -408,16 +623,77 @@ int qemu_fclose(QEMUFile *f)
>           ret = f->last_error;
>       }
>       error_free(f->last_error_obj);
> -    g_free(f->current_buf->buf);
> -    g_free(f->current_buf->iov);
> -    g_free(f->current_buf->may_free);
> -    g_free(f->current_buf);
> +
> +    if (f->buffered_mode) {
> +        QEMUFileBuffer *fb, *next;
> +        /*
> +         * put the current back to the free buffers list
> +         * to destroy all the buffers in one loop
> +         */
> +        QLIST_INSERT_HEAD(&f->free_buffers, f->current_buf, link);
> +
> +        /* destroy all the buffers */
> +        QLIST_FOREACH_SAFE(fb, &f->free_buffers, link, next) {
> +            QLIST_REMOVE(fb, link);
> +            /* looks like qemu_vfree pairs with qemu_memalign */
> +            qemu_vfree(fb->buf);
> +            g_free(fb);
> +        }
> +        g_free(f->pool);
> +    } else {
> +        g_free(f->current_buf->buf);
> +        g_free(f->current_buf->iov);
> +        g_free(f->current_buf->may_free);
> +        g_free(f->current_buf);
> +    }
> +
>       g_free(f);
>       trace_qemu_file_fclose();
>       return ret;
>   }
>   
>   /*
> + * Copy an external buffer to the intenal current buffer.
> + */
> +static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
> +                     bool may_free)
> +{
> +    size_t data_size = size;
> +    const uint8_t *src_ptr = buf;
> +
> +    assert(f->buffered_mode);
> +    assert(size <= INT_MAX);
> +
> +    while (data_size > 0) {
> +        size_t chunk_size;
> +
> +        if (buf_is_full(f)) {
> +            flush_buffer(f);
> +            if (qemu_file_get_error(f)) {
> +                return;
> +            }
> +        }
> +
> +        chunk_size = MIN(get_buf_free_size(f), data_size);
> +
> +        memcpy(get_buf_ptr(f), src_ptr, chunk_size);
> +
> +        advance_buf_ptr(f, chunk_size);
> +
> +        src_ptr += chunk_size;
> +        data_size -= chunk_size;
> +        f->bytes_xfer += chunk_size;
> +    }
> +
> +    if (may_free) {
> +        if (qemu_madvise((void *) buf, size, QEMU_MADV_DONTNEED) < 0) {
> +            error_report("migrate: madvise DONTNEED failed %p %zd: %s",
> +                         buf, size, strerror(errno));
> +        }
> +    }
> +}
> +
> +/*
>    * Add buf to iovec. Do flush if iovec is full.
>    *
>    * Return values:
> @@ -454,6 +730,9 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size,
>   static void add_buf_to_iovec(QEMUFile *f, size_t len)
>   {
>       QEMUFileBuffer *fb = f->current_buf;
> +
> +    assert(!f->buffered_mode);
> +
>       if (!add_to_iovec(f, fb->buf + fb->buf_index, len, false)) {
>           fb->buf_index += len;
>           if (fb->buf_index == IO_BUF_SIZE) {
> @@ -469,8 +748,12 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size,
>           return;
>       }
>   
> -    f->bytes_xfer += size;
> -    add_to_iovec(f, buf, size, may_free);
> +    if (f->buffered_mode) {
> +        copy_buf(f, buf, size, may_free);
> +    } else {
> +        f->bytes_xfer += size;
> +        add_to_iovec(f, buf, size, may_free);
> +    }
>   }
>   
>   void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
> @@ -482,6 +765,11 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
>           return;
>       }
>   
> +    if (f->buffered_mode) {
> +        copy_buf(f, buf, size, false);
> +        return;
> +    }
> +
>       while (size > 0) {
>           l = IO_BUF_SIZE - fb->buf_index;
>           if (l > size) {
> @@ -506,15 +794,21 @@ void qemu_put_byte(QEMUFile *f, int v)
>           return;
>       }
>   
> -    fb->buf[fb->buf_index] = v;
> -    f->bytes_xfer++;
> -    add_buf_to_iovec(f, 1);
> +    if (f->buffered_mode) {
> +        copy_buf(f, (const uint8_t *) &v, 1, false);
> +    } else {
> +        fb->buf[fb->buf_index] = v;
> +        add_buf_to_iovec(f, 1);
> +        f->bytes_xfer++;
> +    }
>   }
>   
>   void qemu_file_skip(QEMUFile *f, int size)
>   {
>       QEMUFileBuffer *fb = f->current_buf;
>   
> +    assert(!f->buffered_mode);
> +
>       if (fb->buf_index + size <= fb->buf_size) {
>           fb->buf_index += size;
>       }
> @@ -672,10 +966,14 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>   {
>       int64_t ret = f->pos;
>       int i;
> -    QEMUFileBuffer *fb = f->current_buf;
>   
> -    for (i = 0; i < fb->iovcnt; i++) {
> -        ret += fb->iov[i].iov_len;
> +    if (f->buffered_mode) {
> +        ret += get_buf_used_size(f);
> +    } else {
> +        QEMUFileBuffer *fb = f->current_buf;
> +        for (i = 0; i < fb->iovcnt; i++) {
> +            ret += fb->iov[i].iov_len;
> +        }
>       }
>   
>       return ret;
> @@ -683,8 +981,12 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>   
>   int64_t qemu_ftell(QEMUFile *f)
>   {
> -    qemu_fflush(f);
> -    return f->pos;
> +    if (f->buffered_mode) {
> +        return qemu_ftell_fast(f);
> +    } else {
> +        qemu_fflush(f);
> +        return f->pos;
> +    }
>   }
>   
>   int qemu_file_rate_limit(QEMUFile *f)
> @@ -803,6 +1105,8 @@ ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
>       QEMUFileBuffer *fb = f->current_buf;
>       ssize_t blen = IO_BUF_SIZE - fb->buf_index - sizeof(int32_t);
>   
> +    assert(!f->buffered_mode);
> +
>       if (blen < compressBound(size)) {
>           return -1;
>       }
> @@ -827,6 +1131,9 @@ int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src)
>       int len = 0;
>       QEMUFileBuffer *fb_src = f_src->current_buf;
>   
> +    assert(!f_des->buffered_mode);
> +    assert(!f_src->buffered_mode);
> +
>       if (fb_src->buf_index > 0) {
>           len = fb_src->buf_index;
>           qemu_put_buffer(f_des, fb_src->buf, fb_src->buf_index);
> diff --git a/migration/qemu-file.h b/migration/qemu-file.h
> index a9b6d6c..08655d2 100644
> --- a/migration/qemu-file.h
> +++ b/migration/qemu-file.h
> @@ -103,6 +103,14 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
>   typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
>                                      Error **errp);
>   
> +/*
> + * Enables or disables the buffered mode
> + * Existing blocking reads/writes must be woken
> + * Returns true if the buffered mode has to be enabled,
> + * false if it has to be disabled.
> + */
> +typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);
> +
>   typedef struct QEMUFileOps {
>       QEMUFileGetBufferFunc *get_buffer;
>       QEMUFileCloseFunc *close;
> @@ -110,6 +118,7 @@ typedef struct QEMUFileOps {
>       QEMUFileWritevBufferFunc *writev_buffer;
>       QEMURetPathFunc *get_return_path;
>       QEMUFileShutdownFunc *shut_down;
> +    QEMUFileEnableBufferedFunc *enable_buffered;
>   } QEMUFileOps;
>   
>   typedef struct QEMUFileHooks {
>
Denis Plotnikov April 27, 2020, 8:19 a.m. UTC | #3
On 25.04.2020 12:10, Vladimir Sementsov-Ogievskiy wrote:
> 13.04.2020 14:12, Denis Plotnikov wrote:
>> The patch adds ability to qemu-file to write the data
>> asynchronously to improve the performance on writing.
>> Before, only synchronous writing was supported.
>>
>> Enabling of the asyncronous mode is managed by new
>> "enabled_buffered" callback.
>
> Hmm.
>
> I don't like resulting architecture very much:
>
> 1. Function naming is not clean: how can I understand that copy_buf is 
> for buffered mode when add_to_iovec is +- same thing for non-buffered 
> mode?
Yes, this is to be changed in the next patches
>
> Hmm actually, you just alter several significant functions of QEMUFile 
> - open, close, put, flush. In old mode we do one thing, in a new mode 
> - absolutely another. This looks like a driver. So may be we want to 
> add QEMUFileDriver struct, to define these functions as callbacks, 
> move old realizations to default driver, and add new functionality as 
> a new driver, what do you think?
Yes it looks like that, but on the other hand I can't imagine another 
driver to be added and changing the code to "the driver notation" would 
involve more code adding. So, should we really do that. Anyway, your 
suggestion looks cleaner.
>
> 2. Terminology: you say you add buffered mode, but actually qemu file 
> is already work in buffered mode, so it should be clarified somehow..
The initial implementation uses mixed implementation, it has a buffer 
and an iovec array. The buffer is used to store *some* parts (ecluding 
RAM) of a vm state + service information. Each written to the buffer is 
added to the iovec array as a separate entry. RAM pages aren't added to 
the buffer, instead they are added to the iovec array directly without 
coping to the buffer. This is why we almost always get the iovec array 
consisting of size- and pointer- unaligned iovec-s and why we have the 
performance issues (more detailed in 0000 of this series).
So, I can't say that the qemu-file has "real" buffered-mode.
> You also add asynchronisity, but old implementation has already 
> qemu_put_buffer_async..
In my opinion, this function name is kind of confusing. What the 
function does is adding the buffer pointer to the internal iovec array 
without coping the buffer. It's not related to some asynchronous operation.
> You use aio task pool, but don't say that it may be used only from 
> coroutine.
> May be, we'd better call it "coroutine-mode" ?
I don't think that mentioning any implementation-specific name is a good 
idea. aio_task_pool is a good option to implement async operation, but 
it can be any other interface.
I'd rather implicitly assert qemu_in_coroutine() when in "buffered-mode".
>
>
> Also, am I correct that new mode can't be used for read? Should we 
> document/assert it somehow? 
It can't just for now since read-buffered mode isn't implemented. 
Enabling buffered-mode for reading affects nothing - qemu-file works as 
before.
The same qemu-file instance can't  be opened for reading and writing at 
the same time. I'd add that assert on qemu-file open.
> Or may be support reading? Will switch to snapshot benefit if we 
> implement reading in a new mode?
I thought about that. I think it could benefit making some kind of 
read-ahead to a number of buffers, while the "current" buffer is used to 
fill a vm state.
I didn't want to focus on the reading improvements because it's not that 
big problem in comparison to the writing. The qemu-file reading uses a 
buffer and fills  that buffer with a single io operation.

>
>>
>> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
>> ---
>>   include/qemu/typedefs.h |   1 +
>>   migration/qemu-file.c   | 351 
>> +++++++++++++++++++++++++++++++++++++++++++++---
>>   migration/qemu-file.h   |   9 ++
>>   3 files changed, 339 insertions(+), 22 deletions(-)
>>
>> diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
>> index 88dce54..9b388c8 100644
>> --- a/include/qemu/typedefs.h
>> +++ b/include/qemu/typedefs.h
>> @@ -98,6 +98,7 @@ typedef struct QEMUBH QEMUBH;
>>   typedef struct QemuConsole QemuConsole;
>>   typedef struct QEMUFile QEMUFile;
>>   typedef struct QEMUFileBuffer QEMUFileBuffer;
>> +typedef struct QEMUFileAioTask QEMUFileAioTask;
>>   typedef struct QemuLockable QemuLockable;
>>   typedef struct QemuMutex QemuMutex;
>>   typedef struct QemuOpt QemuOpt;
>> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>> index 285c6ef..f42f949 100644
>> --- a/migration/qemu-file.c
>> +++ b/migration/qemu-file.c
>> @@ -29,19 +29,25 @@
>>   #include "qemu-file.h"
>>   #include "trace.h"
>>   #include "qapi/error.h"
>> +#include "block/aio_task.h"
>>   -#define IO_BUF_SIZE 32768
>> +#define IO_BUF_SIZE (1024 * 1024)
>>   #define MAX_IOV_SIZE MIN(IOV_MAX, 64)
>> +#define IO_BUF_NUM 2
>
> Interesting, how much is it better than if we set to 1, limiting the 
> influence of the series to alignment of written chunks?
>> +#define IO_BUF_ALIGNMENT 512
>>   -QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, 512));
>> +QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, IO_BUF_ALIGNMENT));
>> +QEMU_BUILD_BUG_ON(IO_BUF_SIZE > INT_MAX);
>> +QEMU_BUILD_BUG_ON(IO_BUF_NUM <= 0);
>>     struct QEMUFileBuffer {
>>       int buf_index;
>> -    int buf_size; /* 0 when writing */
>> +    int buf_size; /* 0 when non-buffered writing */
>>       uint8_t *buf;
>>       unsigned long *may_free;
>>       struct iovec *iov;
>>       unsigned int iovcnt;
>> +    QLIST_ENTRY(QEMUFileBuffer) link;
>>   };
>>     struct QEMUFile {
>> @@ -60,6 +66,22 @@ struct QEMUFile {
>>       bool shutdown;
>>       /* currently used buffer */
>>       QEMUFileBuffer *current_buf;
>> +    /*
>> +     * with buffered_mode enabled all the data copied to 512 byte
>> +     * aligned buffer, including iov data. Then the buffer is passed
>> +     * to writev_buffer callback.
>> +     */
>> +    bool buffered_mode;
>> +    /* for async buffer writing */
>> +    AioTaskPool *pool;
>> +    /* the list of free buffers, currently used on is NOT there */
>> +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
>> +};
>> +
>> +struct QEMUFileAioTask {
>> +    AioTask task;
>> +    QEMUFile *f;
>> +    QEMUFileBuffer *fb;
>>   };
>>     /*
>> @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const 
>> QEMUFileOps *ops)
>>       f->opaque = opaque;
>>       f->ops = ops;
>>   -    f->current_buf = g_new0(QEMUFileBuffer, 1);
>> -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>> -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>> -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>> +    if (f->ops->enable_buffered) {
>> +        f->buffered_mode = f->ops->enable_buffered(f->opaque);
>> +    }
>> +
>> +    if (f->buffered_mode && qemu_file_is_writable(f)) {
>
> So we actually go to buffered_mode if file is writable. Then, 
> shouldn't we otherwise set buffered_mode to false otherwise?
buffered_mode is initialized with false
>
>> +        int i;
>> +        /*
>> +         * in buffered_mode we don't use internal io vectors
>> +         * and may_free bitmap, because we copy the data to be
>> +         * written right away to the buffer
>> +         */
>> +        f->pool = aio_task_pool_new(IO_BUF_NUM);
>> +
>> +        /* allocate io buffers */
>> +        for (i = 0; i < IO_BUF_NUM; i++) {
>> +            QEMUFileBuffer *fb = g_new0(QEMUFileBuffer, 1);
>> +
>> +            fb->buf = qemu_memalign(IO_BUF_ALIGNMENT, IO_BUF_SIZE);
>> +            fb->buf_size = IO_BUF_SIZE;
>> +
>> +            /*
>> +             * put the first buffer to the current buf and the rest
>> +             * to the list of free buffers
>> +             */
>> +            if (i == 0) {
>> +                f->current_buf = fb;
>> +            } else {
>> +                QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>> +            }
>> +        }
>> +    } else {
>> +        f->current_buf = g_new0(QEMUFileBuffer, 1);
>> +        f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>> +        f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>> +        f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>> +    }
>>         return f;
>>   }
>> @@ -190,6 +244,8 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>>       unsigned long idx;
>>       QEMUFileBuffer *fb = f->current_buf;
>>   +    assert(!f->buffered_mode);
>> +
>>       /* Find and release all the contiguous memory ranges marked as 
>> may_free. */
>>       idx = find_next_bit(fb->may_free, fb->iovcnt, 0);
>>       if (idx >= fb->iovcnt) {
>> @@ -221,6 +277,147 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>>       bitmap_zero(fb->may_free, MAX_IOV_SIZE);
>>   }
>>   +static void advance_buf_ptr(QEMUFile *f, size_t size)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    /* must not advance to 0 */
>> +    assert(size);
>> +    /* must not overflow buf_index (int) */
>> +    assert(fb->buf_index + size <= INT_MAX);
>
> to not overflow in check: assert(fb->buf_index <= INT_MAX - size)
good catch
>
>> +    /* must not exceed buf_size */
>> +    assert(fb->buf_index + size <= fb->buf_size);
>> +
>> +    fb->buf_index += size;
>> +}
>> +
>> +static size_t get_buf_free_size(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    /* buf_index can't be greated than buf_size */
>> +    assert(fb->buf_size >= fb->buf_index);
>> +    return fb->buf_size - fb->buf_index;
>> +}
>> +
>> +static size_t get_buf_used_size(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    return fb->buf_index;
>> +}
>> +
>> +static uint8_t *get_buf_ptr(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    /* protects from out of bound reading */
>> +    assert(fb->buf_index <= IO_BUF_SIZE);
>> +    return fb->buf + fb->buf_index;
>> +}
>> +
>> +static bool buf_is_full(QEMUFile *f)
>> +{
>> +    return get_buf_free_size(f) == 0;
>> +}
>> +
>> +static void reset_buf(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    fb->buf_index = 0;
>> +}
>> +
>> +static int write_task_fn(AioTask *task)
>> +{
>> +    int ret;
>> +    Error *local_error = NULL;
>> +    QEMUFileAioTask *t = (QEMUFileAioTask *) task;
>> +    QEMUFile *f = t->f;
>> +    QEMUFileBuffer *fb = t->fb;
>> +    uint64_t pos = f->pos;
>> +    struct iovec v = (struct iovec) {
>> +        .iov_base = fb->buf,
>> +        .iov_len = fb->buf_index,
>> +    };
>> +
>> +    assert(f->buffered_mode);
>> +
>> +    /*
>> +     * Increment file position.
>> +     * This needs to be here before calling writev_buffer, because
>> +     * writev_buffer is asynchronous and there could be more than one
>> +     * writev_buffer started simultaniously. Each writev_buffer should
>> +     * use its own file pos to write to. writev_buffer may write less
>> +     * than buf_index bytes but we treat this situation as an error.
>> +     * If error appeared, further file using is meaningless.
>> +     * We expect that, the most of the time the full buffer is written,
>> +     * (when buf_size == buf_index). The only case when the non-full
>> +     * buffer is written (buf_size != buf_index) is file close,
>> +     * when we need to flush the rest of the buffer content.
>> +     */
>> +    f->pos += fb->buf_index;
>
> Seems safer to add pos to QEMUFileAioTask instead, and manage global 
> f->pos
> in main coroutine, not in tasks.
I also though about that but I didn't find any benefit because with 
couroutines we shouldn't get race problems
>
>> +
>> +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
>> +
>> +    /* return the just written buffer to the free list */
>> +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>> +
>> +    /* check that we have written everything */
>> +    if (ret != fb->buf_index) {
>> +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
>> +    }
>> +
>> +    /*
>> +     * always return 0 - don't use task error handling, relay on
>> +     * qemu file error handling
>> +     */
>> +    return 0;
>> +}
>> +
>> +static void qemu_file_switch_current_buf(QEMUFile *f)
>> +{
>> +    /*
>> +     * if the list is empty, wait until some task returns a buffer
>> +     * to the list of free buffers.
>> +     */
>> +    if (QLIST_EMPTY(&f->free_buffers)) {
>> +        aio_task_pool_wait_slot(f->pool);
>> +    }
>> +
>> +    /*
>> +     * sanity check that the list isn't empty
>> +     * if the free list was empty, we waited for a task complition,
>> +     * and the pompleted task must return a buffer to a list of free 
>> buffers
>> +     */
>> +    assert(!QLIST_EMPTY(&f->free_buffers));
>> +
>> +    /* set the current buffer for using from the free list */
>> +    f->current_buf = QLIST_FIRST(&f->free_buffers);
>> +    reset_buf(f);
>> +
>> +    QLIST_REMOVE(f->current_buf, link);
>> +}
>> +
>> +/**
>> + *  Asynchronously flushes QEMUFile buffer
>> + *
>> + * This will flush all pending data. If data was only partially 
>> flushed, it
>> + * will set an error state. The function may return before the data 
>> actually
>> + * written.
>> + */
>> +static void flush_buffer(QEMUFile *f)
>> +{
>> +    QEMUFileAioTask *t = g_new(QEMUFileAioTask, 1);
>> +
>> +    *t = (QEMUFileAioTask) {
>> +        .task.func = &write_task_fn,
>> +        .f = f,
>> +        .fb = f->current_buf,
>> +    };
>> +
>> +    /* aio_task_pool should free t for us */
>> +    aio_task_pool_start_task(f->pool, (AioTask *) t);
>> +
>> +    /* if no errors this will switch the buffer */
>> +    qemu_file_switch_current_buf(f);
>> +}
>> +
>>   /**
>>    * Flushes QEMUFile buffer
>>    *
>> @@ -241,7 +438,13 @@ void qemu_fflush(QEMUFile *f)
>>       if (f->shutdown) {
>>           return;
>>       }
>> +
>> +    if (f->buffered_mode) {
>> +        return;
>
> I don't think it's correct. qemu_fflush is public interface of 
> QEMUFile and it's assumed to write all in-flight data.
Yes it should, when you use sockets and you want to make sure that 
you've sent what your peer is waiting for.
But this isn't the case when you write an internal snapshot, especially 
when you explicitly ask to open qemu-file in "buffered_mode".
>
>> +    }
>> +
>>       if (fb->iovcnt > 0) {
>> +        /* this is non-buffered mode */
>>           expect = iov_size(fb->iov, fb->iovcnt);
>>           ret = f->ops->writev_buffer(f->opaque, fb->iov, fb->iovcnt, 
>> f->pos,
>>                                       &local_error);
>> @@ -378,6 +581,7 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>>     void qemu_update_position(QEMUFile *f, size_t size)
>>   {
>> +    assert(!f->buffered_mode);
>
> Public interface. Why are you sure that it's not used in snapshot? It 
> is used in migration/ram.c ...
Looks like on reading... I didn't encounter that assert on writing when 
testing. Add this protection just in case, because ...
>
>>       f->pos += size;
>
> And it shouldn't be the problem for the new mode, just ass pos 
> parameter to QEMUFileAioTask,
...It could be a problem. In the new mode we want our buffers 
size-aligned, if we update pos, we need to fill the rest of the buffer 
with zeros, put the buffer for writing, get another free buffer for using.
But I didn't see qemu_update_position() using in writing path.
>
>>   }
>>   @@ -392,7 +596,18 @@ void qemu_update_position(QEMUFile *f, size_t 
>> size)
>>   int qemu_fclose(QEMUFile *f)
>>   {
>>       int ret;
>> -    qemu_fflush(f);
>> +
>> +    if (qemu_file_is_writable(f) && f->buffered_mode) {
>> +        ret = qemu_file_get_error(f);
>> +        if (!ret) {
>> +            flush_buffer(f);
>> +        }
>> +        /* wait until all tasks are done */
>> +        aio_task_pool_wait_all(f->pool);
>> +    } else {
>> +        qemu_fflush(f);
>> +    }
>
> Again, not clean thing: for buffered mode we use flush_buffer, for 
> non-buffered we use qemu_fflush.. It's not clean from function naming.
Will improve the naming
>
>> +
>>       ret = qemu_file_get_error(f);
>>         if (f->ops->close) {
>> @@ -408,16 +623,77 @@ int qemu_fclose(QEMUFile *f)
>>           ret = f->last_error;
>>       }
>>       error_free(f->last_error_obj);
>> -    g_free(f->current_buf->buf);
>> -    g_free(f->current_buf->iov);
>> -    g_free(f->current_buf->may_free);
>> -    g_free(f->current_buf);
>> +
>> +    if (f->buffered_mode) {
>> +        QEMUFileBuffer *fb, *next;
>> +        /*
>> +         * put the current back to the free buffers list
>> +         * to destroy all the buffers in one loop
>> +         */
>> +        QLIST_INSERT_HEAD(&f->free_buffers, f->current_buf, link);
>> +
>> +        /* destroy all the buffers */
>> +        QLIST_FOREACH_SAFE(fb, &f->free_buffers, link, next) {
>> +            QLIST_REMOVE(fb, link);
>> +            /* looks like qemu_vfree pairs with qemu_memalign */
>> +            qemu_vfree(fb->buf);
>> +            g_free(fb);
>> +        }
>> +        g_free(f->pool);
>> +    } else {
>> +        g_free(f->current_buf->buf);
>> +        g_free(f->current_buf->iov);
>> +        g_free(f->current_buf->may_free);
>> +        g_free(f->current_buf);
>> +    }
>> +
>>       g_free(f);
>>       trace_qemu_file_fclose();
>>       return ret;
>>   }
>>     /*
>> + * Copy an external buffer to the intenal current buffer.
>> + */
>> +static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
>> +                     bool may_free)
>> +{
>> +    size_t data_size = size;
>> +    const uint8_t *src_ptr = buf;
>> +
>> +    assert(f->buffered_mode);
>> +    assert(size <= INT_MAX);
>> +
>> +    while (data_size > 0) {
>> +        size_t chunk_size;
>> +
>> +        if (buf_is_full(f)) {
>> +            flush_buffer(f);
>> +            if (qemu_file_get_error(f)) {
>> +                return;
>> +            }
>> +        }
>> +
>> +        chunk_size = MIN(get_buf_free_size(f), data_size);
>> +
>> +        memcpy(get_buf_ptr(f), src_ptr, chunk_size);
>> +
>> +        advance_buf_ptr(f, chunk_size);
>> +
>> +        src_ptr += chunk_size;
>> +        data_size -= chunk_size;
>> +        f->bytes_xfer += chunk_size;
>> +    }
>> +
>> +    if (may_free) {
>> +        if (qemu_madvise((void *) buf, size, QEMU_MADV_DONTNEED) < 0) {
>> +            error_report("migrate: madvise DONTNEED failed %p %zd: %s",
>> +                         buf, size, strerror(errno));
>> +        }
>> +    }
>> +}
>> +
>> +/*
>>    * Add buf to iovec. Do flush if iovec is full.
>>    *
>>    * Return values:
>> @@ -454,6 +730,9 @@ static int add_to_iovec(QEMUFile *f, const 
>> uint8_t *buf, size_t size,
>>   static void add_buf_to_iovec(QEMUFile *f, size_t len)
>>   {
>>       QEMUFileBuffer *fb = f->current_buf;
>> +
>> +    assert(!f->buffered_mode);
>> +
>>       if (!add_to_iovec(f, fb->buf + fb->buf_index, len, false)) {
>>           fb->buf_index += len;
>>           if (fb->buf_index == IO_BUF_SIZE) {
>> @@ -469,8 +748,12 @@ void qemu_put_buffer_async(QEMUFile *f, const 
>> uint8_t *buf, size_t size,
>>           return;
>>       }
>>   -    f->bytes_xfer += size;
>> -    add_to_iovec(f, buf, size, may_free);
>> +    if (f->buffered_mode) {
>> +        copy_buf(f, buf, size, may_free);
>> +    } else {
>> +        f->bytes_xfer += size;
>> +        add_to_iovec(f, buf, size, may_free);
>> +    }
>>   }
>>     void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
>> @@ -482,6 +765,11 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t 
>> *buf, size_t size)
>>           return;
>>       }
>>   +    if (f->buffered_mode) {
>> +        copy_buf(f, buf, size, false);
>> +        return;
>> +    }
>> +
>>       while (size > 0) {
>>           l = IO_BUF_SIZE - fb->buf_index;
>>           if (l > size) {
>> @@ -506,15 +794,21 @@ void qemu_put_byte(QEMUFile *f, int v)
>>           return;
>>       }
>>   -    fb->buf[fb->buf_index] = v;
>> -    f->bytes_xfer++;
>> -    add_buf_to_iovec(f, 1);
>> +    if (f->buffered_mode) {
>> +        copy_buf(f, (const uint8_t *) &v, 1, false);
>> +    } else {
>> +        fb->buf[fb->buf_index] = v;
>> +        add_buf_to_iovec(f, 1);
>> +        f->bytes_xfer++;
>> +    }
>>   }
>>     void qemu_file_skip(QEMUFile *f, int size)
>>   {
>>       QEMUFileBuffer *fb = f->current_buf;
>>   +    assert(!f->buffered_mode);
>> +
>>       if (fb->buf_index + size <= fb->buf_size) {
>>           fb->buf_index += size;
>>       }
>> @@ -672,10 +966,14 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>>   {
>>       int64_t ret = f->pos;
>>       int i;
>> -    QEMUFileBuffer *fb = f->current_buf;
>>   -    for (i = 0; i < fb->iovcnt; i++) {
>> -        ret += fb->iov[i].iov_len;
>> +    if (f->buffered_mode) {
>> +        ret += get_buf_used_size(f);
>> +    } else {
>> +        QEMUFileBuffer *fb = f->current_buf;
>> +        for (i = 0; i < fb->iovcnt; i++) {
>> +            ret += fb->iov[i].iov_len;
>> +        }
>>       }
>>         return ret;
>> @@ -683,8 +981,12 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>>     int64_t qemu_ftell(QEMUFile *f)
>>   {
>> -    qemu_fflush(f);
>> -    return f->pos;
>> +    if (f->buffered_mode) {
>> +        return qemu_ftell_fast(f);
>> +    } else {
>> +        qemu_fflush(f);
>> +        return f->pos;
>> +    }
>>   }
>>     int qemu_file_rate_limit(QEMUFile *f)
>> @@ -803,6 +1105,8 @@ ssize_t qemu_put_compression_data(QEMUFile *f, 
>> z_stream *stream,
>>       QEMUFileBuffer *fb = f->current_buf;
>>       ssize_t blen = IO_BUF_SIZE - fb->buf_index - sizeof(int32_t);
>>   +    assert(!f->buffered_mode);
>> +
>>       if (blen < compressBound(size)) {
>>           return -1;
>>       }
>> @@ -827,6 +1131,9 @@ int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile 
>> *f_src)
>>       int len = 0;
>>       QEMUFileBuffer *fb_src = f_src->current_buf;
>>   +    assert(!f_des->buffered_mode);
>> +    assert(!f_src->buffered_mode);
>> +
>>       if (fb_src->buf_index > 0) {
>>           len = fb_src->buf_index;
>>           qemu_put_buffer(f_des, fb_src->buf, fb_src->buf_index);
>> diff --git a/migration/qemu-file.h b/migration/qemu-file.h
>> index a9b6d6c..08655d2 100644
>> --- a/migration/qemu-file.h
>> +++ b/migration/qemu-file.h
>> @@ -103,6 +103,14 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
>>   typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
>>                                      Error **errp);
>>   +/*
>> + * Enables or disables the buffered mode
>> + * Existing blocking reads/writes must be woken
>> + * Returns true if the buffered mode has to be enabled,
>> + * false if it has to be disabled.
>> + */
>> +typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);
>> +
>>   typedef struct QEMUFileOps {
>>       QEMUFileGetBufferFunc *get_buffer;
>>       QEMUFileCloseFunc *close;
>> @@ -110,6 +118,7 @@ typedef struct QEMUFileOps {
>>       QEMUFileWritevBufferFunc *writev_buffer;
>>       QEMURetPathFunc *get_return_path;
>>       QEMUFileShutdownFunc *shut_down;
>> +    QEMUFileEnableBufferedFunc *enable_buffered;
>>   } QEMUFileOps;
>>     typedef struct QEMUFileHooks {
>>
>
>
Denis Plotnikov April 27, 2020, 8:21 a.m. UTC | #4
On 25.04.2020 00:25, Eric Blake wrote:
> On 4/13/20 6:12 AM, Denis Plotnikov wrote:
>> The patch adds ability to qemu-file to write the data
>> asynchronously to improve the performance on writing.
>> Before, only synchronous writing was supported.
>>
>> Enabling of the asyncronous mode is managed by new
>
> asynchronous
>
>> "enabled_buffered" callback.
>
> The term "enabled_buffered" does not appear in the patch.  Did you 
> mean...
>
>>
>> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
>> ---
>>   include/qemu/typedefs.h |   1 +
>>   migration/qemu-file.c   | 351 
>> +++++++++++++++++++++++++++++++++++++++++++++---
>>   migration/qemu-file.h   |   9 ++
>>   3 files changed, 339 insertions(+), 22 deletions(-)
>>
>
>> @@ -60,6 +66,22 @@ struct QEMUFile {
>>       bool shutdown;
>>       /* currently used buffer */
>>       QEMUFileBuffer *current_buf;
>> +    /*
>> +     * with buffered_mode enabled all the data copied to 512 byte
>> +     * aligned buffer, including iov data. Then the buffer is passed
>> +     * to writev_buffer callback.
>> +     */
>> +    bool buffered_mode;
>
> ..."Asynchronous mode is managed by setting the new buffered_mode 
> flag"?  ...
>
>
>> +    /* for async buffer writing */
>> +    AioTaskPool *pool;
>> +    /* the list of free buffers, currently used on is NOT there */
>
> s/on/one/
>
>> +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
>> +};
>> +
>> +struct QEMUFileAioTask {
>> +    AioTask task;
>> +    QEMUFile *f;
>> +    QEMUFileBuffer *fb;
>>   };
>>     /*
>> @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const 
>> QEMUFileOps *ops)
>>       f->opaque = opaque;
>>       f->ops = ops;
>>   -    f->current_buf = g_new0(QEMUFileBuffer, 1);
>> -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>> -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>> -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>> +    if (f->ops->enable_buffered) {
>> +        f->buffered_mode = f->ops->enable_buffered(f->opaque);
>
> ...ah, you meant 'enable_buffered'.  But still, why do we need a 
> callback function?  Is it not sufficient to just have a bool flag?
>
>
>> +static size_t get_buf_free_size(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    /* buf_index can't be greated than buf_size */
>
> greater
>
>> +    assert(fb->buf_size >= fb->buf_index);
>> +    return fb->buf_size - fb->buf_index;
>> +}
>> +
>
>> +static int write_task_fn(AioTask *task)
>> +{
>
>> +    /*
>> +     * Increment file position.
>> +     * This needs to be here before calling writev_buffer, because
>> +     * writev_buffer is asynchronous and there could be more than one
>> +     * writev_buffer started simultaniously. Each writev_buffer should
>
> simultaneously
>
>> +     * use its own file pos to write to. writev_buffer may write less
>> +     * than buf_index bytes but we treat this situation as an error.
>> +     * If error appeared, further file using is meaningless.
>
> s/using/use/
>
>> +     * We expect that, the most of the time the full buffer is written,
>> +     * (when buf_size == buf_index). The only case when the non-full
>> +     * buffer is written (buf_size != buf_index) is file close,
>> +     * when we need to flush the rest of the buffer content.
>
> We expect that most of the time, the full buffer will be written 
> (buf_size == buf_index), with the exception at file close where we 
> need to flush the final partial buffer.
>
>> +     */
>> +    f->pos += fb->buf_index;
>> +
>> +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
>> +
>> +    /* return the just written buffer to the free list */
>> +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>> +
>> +    /* check that we have written everything */
>> +    if (ret != fb->buf_index) {
>> +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
>> +    }
>> +
>> +    /*
>> +     * always return 0 - don't use task error handling, relay on
>
> rely
>
>> +     * qemu file error handling
>> +     */
>> +    return 0;
>> +}
>> +
>> +static void qemu_file_switch_current_buf(QEMUFile *f)
>> +{
>> +    /*
>> +     * if the list is empty, wait until some task returns a buffer
>> +     * to the list of free buffers.
>> +     */
>> +    if (QLIST_EMPTY(&f->free_buffers)) {
>> +        aio_task_pool_wait_slot(f->pool);
>> +    }
>> +
>> +    /*
>> +     * sanity check that the list isn't empty
>> +     * if the free list was empty, we waited for a task complition,
>
> completion
>
>> +     * and the pompleted task must return a buffer to a list of free 
>> buffers
>
> completed
>
>> +     */
>> +    assert(!QLIST_EMPTY(&f->free_buffers));
>> +
>> +    /* set the current buffer for using from the free list */
>> +    f->current_buf = QLIST_FIRST(&f->free_buffers);
>> +    reset_buf(f);
>> +
>> +    QLIST_REMOVE(f->current_buf, link);
>> +}
>> +
>
>>     /*
>> + * Copy an external buffer to the intenal current buffer.
>
> internal
>
>> + */
>> +static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
>> +                     bool may_free)
>> +{
>
>> +++ b/migration/qemu-file.h
>> @@ -103,6 +103,14 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
>>   typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
>>                                      Error **errp);
>>   +/*
>> + * Enables or disables the buffered mode
>> + * Existing blocking reads/writes must be woken
>> + * Returns true if the buffered mode has to be enabled,
>> + * false if it has to be disabled.
>> + */
>> +typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);
>
> If this never gets called outside of initial creation of the QemuFile 
> (that is, it is not dynamic), then making it a straight flag instead 
> of a callback function is simpler.
Yes, I agree.

Thanks for reviewing and lots of grammar fixing!
>
>
>> +
>>   typedef struct QEMUFileOps {
>>       QEMUFileGetBufferFunc *get_buffer;
>>       QEMUFileCloseFunc *close;
>> @@ -110,6 +118,7 @@ typedef struct QEMUFileOps {
>>       QEMUFileWritevBufferFunc *writev_buffer;
>>       QEMURetPathFunc *get_return_path;
>>       QEMUFileShutdownFunc *shut_down;
>> +    QEMUFileEnableBufferedFunc *enable_buffered;
>>   } QEMUFileOps;
>>     typedef struct QEMUFileHooks {
>>
>
Vladimir Sementsov-Ogievskiy April 27, 2020, 11:04 a.m. UTC | #5
27.04.2020 11:19, Denis Plotnikov wrote:
> 
> 
> On 25.04.2020 12:10, Vladimir Sementsov-Ogievskiy wrote:
>> 13.04.2020 14:12, Denis Plotnikov wrote:
>>> The patch adds ability to qemu-file to write the data
>>> asynchronously to improve the performance on writing.
>>> Before, only synchronous writing was supported.
>>>
>>> Enabling of the asyncronous mode is managed by new
>>> "enabled_buffered" callback.
>>
>> Hmm.
>>
>> I don't like resulting architecture very much:
>>
>> 1. Function naming is not clean: how can I understand that copy_buf is for buffered mode when add_to_iovec is +- same thing for non-buffered mode?
> Yes, this is to be changed in the next patches
>>
>> Hmm actually, you just alter several significant functions of QEMUFile - open, close, put, flush. In old mode we do one thing, in a new mode - absolutely another. This looks like a driver. So may be we want to add QEMUFileDriver struct, to define these functions as callbacks, move old realizations to default driver, and add new functionality as a new driver, what do you think?
> Yes it looks like that, but on the other hand I can't imagine another driver to be added and changing the code to "the driver notation" would involve more code adding. So, should we really do that. Anyway, your suggestion looks cleaner.
>>
>> 2. Terminology: you say you add buffered mode, but actually qemu file is already work in buffered mode, so it should be clarified somehow..
> The initial implementation uses mixed implementation, it has a buffer and an iovec array. The buffer is used to store *some* parts (ecluding RAM) of a vm state + service information. Each written to the buffer is added to the iovec array as a separate entry. RAM pages aren't added to the buffer, instead they are added to the iovec array directly without coping to the buffer. This is why we almost always get the iovec array consisting of size- and pointer- unaligned iovec-s and why we have the performance issues (more detailed in 0000 of this series).
> So, I can't say that the qemu-file has "real" buffered-mode.

But if someone will come and try to understand the code, it would be difficult because of such naming. If we keep it, we should add your description as comment to that name.

>> You also add asynchronisity, but old implementation has already qemu_put_buffer_async..
> In my opinion, this function name is kind of confusing. What the function does is adding the buffer pointer to the internal iovec array without coping the buffer. It's not related to some asynchronous operation.
>> You use aio task pool, but don't say that it may be used only from coroutine.
>> May be, we'd better call it "coroutine-mode" ?
> I don't think that mentioning any implementation-specific name is a good idea. aio_task_pool is a good option to implement async operation, but it can be any other interface.
> I'd rather implicitly assert qemu_in_coroutine() when in "buffered-mode".
>>
>>
>> Also, am I correct that new mode can't be used for read? Should we document/assert it somehow? 
> It can't just for now since read-buffered mode isn't implemented. Enabling buffered-mode for reading affects nothing - qemu-file works as before.
> The same qemu-file instance can't  be opened for reading and writing at the same time. I'd add that assert on qemu-file open.

Ah, OK. I saw it, but thought somehow that enabled writing doesn't protect from reading.

>> Or may be support reading? Will switch to snapshot benefit if we implement reading in a new mode?
> I thought about that. I think it could benefit making some kind of read-ahead to a number of buffers, while the "current" buffer is used to fill a vm state.
> I didn't want to focus on the reading improvements because it's not that big problem in comparison to the writing. The qemu-file reading uses a buffer and fills  that buffer with a single io operation.

OK

> 
>>
>>>
>>> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
>>> ---
>>>   include/qemu/typedefs.h |   1 +
>>>   migration/qemu-file.c   | 351 +++++++++++++++++++++++++++++++++++++++++++++---
>>>   migration/qemu-file.h   |   9 ++
>>>   3 files changed, 339 insertions(+), 22 deletions(-)
>>>
>>> diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
>>> index 88dce54..9b388c8 100644
>>> --- a/include/qemu/typedefs.h
>>> +++ b/include/qemu/typedefs.h
>>> @@ -98,6 +98,7 @@ typedef struct QEMUBH QEMUBH;
>>>   typedef struct QemuConsole QemuConsole;
>>>   typedef struct QEMUFile QEMUFile;
>>>   typedef struct QEMUFileBuffer QEMUFileBuffer;
>>> +typedef struct QEMUFileAioTask QEMUFileAioTask;
>>>   typedef struct QemuLockable QemuLockable;
>>>   typedef struct QemuMutex QemuMutex;
>>>   typedef struct QemuOpt QemuOpt;
>>> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>>> index 285c6ef..f42f949 100644
>>> --- a/migration/qemu-file.c
>>> +++ b/migration/qemu-file.c
>>> @@ -29,19 +29,25 @@
>>>   #include "qemu-file.h"
>>>   #include "trace.h"
>>>   #include "qapi/error.h"
>>> +#include "block/aio_task.h"
>>>   -#define IO_BUF_SIZE 32768
>>> +#define IO_BUF_SIZE (1024 * 1024)
>>>   #define MAX_IOV_SIZE MIN(IOV_MAX, 64)
>>> +#define IO_BUF_NUM 2
>>
>> Interesting, how much is it better than if we set to 1, limiting the influence of the series to alignment of written chunks?
>>> +#define IO_BUF_ALIGNMENT 512
>>>   -QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, 512));
>>> +QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, IO_BUF_ALIGNMENT));
>>> +QEMU_BUILD_BUG_ON(IO_BUF_SIZE > INT_MAX);
>>> +QEMU_BUILD_BUG_ON(IO_BUF_NUM <= 0);
>>>     struct QEMUFileBuffer {
>>>       int buf_index;
>>> -    int buf_size; /* 0 when writing */
>>> +    int buf_size; /* 0 when non-buffered writing */
>>>       uint8_t *buf;
>>>       unsigned long *may_free;
>>>       struct iovec *iov;
>>>       unsigned int iovcnt;
>>> +    QLIST_ENTRY(QEMUFileBuffer) link;
>>>   };
>>>     struct QEMUFile {
>>> @@ -60,6 +66,22 @@ struct QEMUFile {
>>>       bool shutdown;
>>>       /* currently used buffer */
>>>       QEMUFileBuffer *current_buf;
>>> +    /*
>>> +     * with buffered_mode enabled all the data copied to 512 byte
>>> +     * aligned buffer, including iov data. Then the buffer is passed
>>> +     * to writev_buffer callback.
>>> +     */
>>> +    bool buffered_mode;
>>> +    /* for async buffer writing */
>>> +    AioTaskPool *pool;
>>> +    /* the list of free buffers, currently used on is NOT there */
>>> +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
>>> +};
>>> +
>>> +struct QEMUFileAioTask {
>>> +    AioTask task;
>>> +    QEMUFile *f;
>>> +    QEMUFileBuffer *fb;
>>>   };
>>>     /*
>>> @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
>>>       f->opaque = opaque;
>>>       f->ops = ops;
>>>   -    f->current_buf = g_new0(QEMUFileBuffer, 1);
>>> -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>>> -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>>> -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>>> +    if (f->ops->enable_buffered) {
>>> +        f->buffered_mode = f->ops->enable_buffered(f->opaque);
>>> +    }
>>> +
>>> +    if (f->buffered_mode && qemu_file_is_writable(f)) {
>>
>> So we actually go to buffered_mode if file is writable. Then, shouldn't we otherwise set buffered_mode to false otherwise?
> buffered_mode is initialized with false

I mean, if enable_buffered() returned true but qemu_file_is_writable() returned false.. I now think it should an error.

>>
>>> +        int i;
>>> +        /*
>>> +         * in buffered_mode we don't use internal io vectors
>>> +         * and may_free bitmap, because we copy the data to be
>>> +         * written right away to the buffer
>>> +         */
>>> +        f->pool = aio_task_pool_new(IO_BUF_NUM);
>>> +
>>> +        /* allocate io buffers */
>>> +        for (i = 0; i < IO_BUF_NUM; i++) {
>>> +            QEMUFileBuffer *fb = g_new0(QEMUFileBuffer, 1);
>>> +
>>> +            fb->buf = qemu_memalign(IO_BUF_ALIGNMENT, IO_BUF_SIZE);
>>> +            fb->buf_size = IO_BUF_SIZE;
>>> +
>>> +            /*
>>> +             * put the first buffer to the current buf and the rest
>>> +             * to the list of free buffers
>>> +             */
>>> +            if (i == 0) {
>>> +                f->current_buf = fb;
>>> +            } else {
>>> +                QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>>> +            }
>>> +        }
>>> +    } else {
>>> +        f->current_buf = g_new0(QEMUFileBuffer, 1);
>>> +        f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>>> +        f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>>> +        f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>>> +    }
>>>         return f;
>>>   }
>>> @@ -190,6 +244,8 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>>>       unsigned long idx;
>>>       QEMUFileBuffer *fb = f->current_buf;
>>>   +    assert(!f->buffered_mode);
>>> +
>>>       /* Find and release all the contiguous memory ranges marked as may_free. */
>>>       idx = find_next_bit(fb->may_free, fb->iovcnt, 0);
>>>       if (idx >= fb->iovcnt) {
>>> @@ -221,6 +277,147 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>>>       bitmap_zero(fb->may_free, MAX_IOV_SIZE);
>>>   }
>>>   +static void advance_buf_ptr(QEMUFile *f, size_t size)
>>> +{
>>> +    QEMUFileBuffer *fb = f->current_buf;
>>> +    /* must not advance to 0 */
>>> +    assert(size);
>>> +    /* must not overflow buf_index (int) */
>>> +    assert(fb->buf_index + size <= INT_MAX);
>>
>> to not overflow in check: assert(fb->buf_index <= INT_MAX - size)
> good catch
>>
>>> +    /* must not exceed buf_size */
>>> +    assert(fb->buf_index + size <= fb->buf_size);
>>> +
>>> +    fb->buf_index += size;
>>> +}
>>> +
>>> +static size_t get_buf_free_size(QEMUFile *f)
>>> +{
>>> +    QEMUFileBuffer *fb = f->current_buf;
>>> +    /* buf_index can't be greated than buf_size */
>>> +    assert(fb->buf_size >= fb->buf_index);
>>> +    return fb->buf_size - fb->buf_index;
>>> +}
>>> +
>>> +static size_t get_buf_used_size(QEMUFile *f)
>>> +{
>>> +    QEMUFileBuffer *fb = f->current_buf;
>>> +    return fb->buf_index;
>>> +}
>>> +
>>> +static uint8_t *get_buf_ptr(QEMUFile *f)
>>> +{
>>> +    QEMUFileBuffer *fb = f->current_buf;
>>> +    /* protects from out of bound reading */
>>> +    assert(fb->buf_index <= IO_BUF_SIZE);
>>> +    return fb->buf + fb->buf_index;
>>> +}
>>> +
>>> +static bool buf_is_full(QEMUFile *f)
>>> +{
>>> +    return get_buf_free_size(f) == 0;
>>> +}
>>> +
>>> +static void reset_buf(QEMUFile *f)
>>> +{
>>> +    QEMUFileBuffer *fb = f->current_buf;
>>> +    fb->buf_index = 0;
>>> +}
>>> +
>>> +static int write_task_fn(AioTask *task)
>>> +{
>>> +    int ret;
>>> +    Error *local_error = NULL;
>>> +    QEMUFileAioTask *t = (QEMUFileAioTask *) task;
>>> +    QEMUFile *f = t->f;
>>> +    QEMUFileBuffer *fb = t->fb;
>>> +    uint64_t pos = f->pos;
>>> +    struct iovec v = (struct iovec) {
>>> +        .iov_base = fb->buf,
>>> +        .iov_len = fb->buf_index,
>>> +    };
>>> +
>>> +    assert(f->buffered_mode);
>>> +
>>> +    /*
>>> +     * Increment file position.
>>> +     * This needs to be here before calling writev_buffer, because
>>> +     * writev_buffer is asynchronous and there could be more than one
>>> +     * writev_buffer started simultaniously. Each writev_buffer should
>>> +     * use its own file pos to write to. writev_buffer may write less
>>> +     * than buf_index bytes but we treat this situation as an error.
>>> +     * If error appeared, further file using is meaningless.
>>> +     * We expect that, the most of the time the full buffer is written,
>>> +     * (when buf_size == buf_index). The only case when the non-full
>>> +     * buffer is written (buf_size != buf_index) is file close,
>>> +     * when we need to flush the rest of the buffer content.
>>> +     */
>>> +    f->pos += fb->buf_index;
>>
>> Seems safer to add pos to QEMUFileAioTask instead, and manage global f->pos
>> in main coroutine, not in tasks.
> I also though about that but I didn't find any benefit because with couroutines we shouldn't get race problems

Still, it would be more obvious code pattern IMHO, so that task is offset and bytes, instead of just bytes and global offset variable which we should update with care.

>>
>>> +
>>> +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
>>> +
>>> +    /* return the just written buffer to the free list */
>>> +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>>> +
>>> +    /* check that we have written everything */
>>> +    if (ret != fb->buf_index) {
>>> +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
>>> +    }
>>> +
>>> +    /*
>>> +     * always return 0 - don't use task error handling, relay on
>>> +     * qemu file error handling
>>> +     */
>>> +    return 0;
>>> +}
>>> +
>>> +static void qemu_file_switch_current_buf(QEMUFile *f)
>>> +{
>>> +    /*
>>> +     * if the list is empty, wait until some task returns a buffer
>>> +     * to the list of free buffers.
>>> +     */
>>> +    if (QLIST_EMPTY(&f->free_buffers)) {
>>> +        aio_task_pool_wait_slot(f->pool);
>>> +    }
>>> +
>>> +    /*
>>> +     * sanity check that the list isn't empty
>>> +     * if the free list was empty, we waited for a task complition,
>>> +     * and the pompleted task must return a buffer to a list of free buffers
>>> +     */
>>> +    assert(!QLIST_EMPTY(&f->free_buffers));
>>> +
>>> +    /* set the current buffer for using from the free list */
>>> +    f->current_buf = QLIST_FIRST(&f->free_buffers);
>>> +    reset_buf(f);
>>> +
>>> +    QLIST_REMOVE(f->current_buf, link);
>>> +}
>>> +
>>> +/**
>>> + *  Asynchronously flushes QEMUFile buffer
>>> + *
>>> + * This will flush all pending data. If data was only partially flushed, it
>>> + * will set an error state. The function may return before the data actually
>>> + * written.
>>> + */
>>> +static void flush_buffer(QEMUFile *f)
>>> +{
>>> +    QEMUFileAioTask *t = g_new(QEMUFileAioTask, 1);
>>> +
>>> +    *t = (QEMUFileAioTask) {
>>> +        .task.func = &write_task_fn,
>>> +        .f = f,
>>> +        .fb = f->current_buf,
>>> +    };
>>> +
>>> +    /* aio_task_pool should free t for us */
>>> +    aio_task_pool_start_task(f->pool, (AioTask *) t);
>>> +
>>> +    /* if no errors this will switch the buffer */
>>> +    qemu_file_switch_current_buf(f);
>>> +}
>>> +
>>>   /**
>>>    * Flushes QEMUFile buffer
>>>    *
>>> @@ -241,7 +438,13 @@ void qemu_fflush(QEMUFile *f)
>>>       if (f->shutdown) {
>>>           return;
>>>       }
>>> +
>>> +    if (f->buffered_mode) {
>>> +        return;
>>
>> I don't think it's correct. qemu_fflush is public interface of QEMUFile and it's assumed to write all in-flight data.
> Yes it should, when you use sockets and you want to make sure that you've sent what your peer is waiting for.
> But this isn't the case when you write an internal snapshot, especially when you explicitly ask to open qemu-file in "buffered_mode".

Hmm, would be good to add a comment in the code..

>>
>>> +    }
>>> +
>>>       if (fb->iovcnt > 0) {
>>> +        /* this is non-buffered mode */
>>>           expect = iov_size(fb->iov, fb->iovcnt);
>>>           ret = f->ops->writev_buffer(f->opaque, fb->iov, fb->iovcnt, f->pos,
>>>                                       &local_error);
>>> @@ -378,6 +581,7 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>>>     void qemu_update_position(QEMUFile *f, size_t size)
>>>   {

[..]
Dr. David Alan Gilbert April 27, 2020, 12:14 p.m. UTC | #6
* Denis Plotnikov (dplotnikov@virtuozzo.com) wrote:
> The patch adds ability to qemu-file to write the data
> asynchronously to improve the performance on writing.
> Before, only synchronous writing was supported.
> 
> Enabling of the asyncronous mode is managed by new
> "enabled_buffered" callback.

It's a bit invasive isn't it - changes a lot of functions in a lot of
places!
The multifd code separated the control headers from the data on separate
fd's - but that doesn't help your case.

Is there any chance you could do this by using the existing 'save_page'
hook (that RDMA uses).

In the cover letter you mention direct qemu_fflush calls - have we got a
few too many in some palces that you think we can clean out?

Dave

> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
> ---
>  include/qemu/typedefs.h |   1 +
>  migration/qemu-file.c   | 351 +++++++++++++++++++++++++++++++++++++++++++++---
>  migration/qemu-file.h   |   9 ++
>  3 files changed, 339 insertions(+), 22 deletions(-)
> 
> diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
> index 88dce54..9b388c8 100644
> --- a/include/qemu/typedefs.h
> +++ b/include/qemu/typedefs.h
> @@ -98,6 +98,7 @@ typedef struct QEMUBH QEMUBH;
>  typedef struct QemuConsole QemuConsole;
>  typedef struct QEMUFile QEMUFile;
>  typedef struct QEMUFileBuffer QEMUFileBuffer;
> +typedef struct QEMUFileAioTask QEMUFileAioTask;
>  typedef struct QemuLockable QemuLockable;
>  typedef struct QemuMutex QemuMutex;
>  typedef struct QemuOpt QemuOpt;
> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> index 285c6ef..f42f949 100644
> --- a/migration/qemu-file.c
> +++ b/migration/qemu-file.c
> @@ -29,19 +29,25 @@
>  #include "qemu-file.h"
>  #include "trace.h"
>  #include "qapi/error.h"
> +#include "block/aio_task.h"
>  
> -#define IO_BUF_SIZE 32768
> +#define IO_BUF_SIZE (1024 * 1024)
>  #define MAX_IOV_SIZE MIN(IOV_MAX, 64)
> +#define IO_BUF_NUM 2
> +#define IO_BUF_ALIGNMENT 512
>  
> -QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, 512));
> +QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, IO_BUF_ALIGNMENT));
> +QEMU_BUILD_BUG_ON(IO_BUF_SIZE > INT_MAX);
> +QEMU_BUILD_BUG_ON(IO_BUF_NUM <= 0);
>  
>  struct QEMUFileBuffer {
>      int buf_index;
> -    int buf_size; /* 0 when writing */
> +    int buf_size; /* 0 when non-buffered writing */
>      uint8_t *buf;
>      unsigned long *may_free;
>      struct iovec *iov;
>      unsigned int iovcnt;
> +    QLIST_ENTRY(QEMUFileBuffer) link;
>  };
>  
>  struct QEMUFile {
> @@ -60,6 +66,22 @@ struct QEMUFile {
>      bool shutdown;
>      /* currently used buffer */
>      QEMUFileBuffer *current_buf;
> +    /*
> +     * with buffered_mode enabled all the data copied to 512 byte
> +     * aligned buffer, including iov data. Then the buffer is passed
> +     * to writev_buffer callback.
> +     */
> +    bool buffered_mode;
> +    /* for async buffer writing */
> +    AioTaskPool *pool;
> +    /* the list of free buffers, currently used on is NOT there */
> +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
> +};
> +
> +struct QEMUFileAioTask {
> +    AioTask task;
> +    QEMUFile *f;
> +    QEMUFileBuffer *fb;
>  };
>  
>  /*
> @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
>      f->opaque = opaque;
>      f->ops = ops;
>  
> -    f->current_buf = g_new0(QEMUFileBuffer, 1);
> -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
> -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
> -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
> +    if (f->ops->enable_buffered) {
> +        f->buffered_mode = f->ops->enable_buffered(f->opaque);
> +    }
> +
> +    if (f->buffered_mode && qemu_file_is_writable(f)) {
> +        int i;
> +        /*
> +         * in buffered_mode we don't use internal io vectors
> +         * and may_free bitmap, because we copy the data to be
> +         * written right away to the buffer
> +         */
> +        f->pool = aio_task_pool_new(IO_BUF_NUM);
> +
> +        /* allocate io buffers */
> +        for (i = 0; i < IO_BUF_NUM; i++) {
> +            QEMUFileBuffer *fb = g_new0(QEMUFileBuffer, 1);
> +
> +            fb->buf = qemu_memalign(IO_BUF_ALIGNMENT, IO_BUF_SIZE);
> +            fb->buf_size = IO_BUF_SIZE;
> +
> +            /*
> +             * put the first buffer to the current buf and the rest
> +             * to the list of free buffers
> +             */
> +            if (i == 0) {
> +                f->current_buf = fb;
> +            } else {
> +                QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
> +            }
> +        }
> +    } else {
> +        f->current_buf = g_new0(QEMUFileBuffer, 1);
> +        f->current_buf->buf = g_malloc(IO_BUF_SIZE);
> +        f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
> +        f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
> +    }
>  
>      return f;
>  }
> @@ -190,6 +244,8 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>      unsigned long idx;
>      QEMUFileBuffer *fb = f->current_buf;
>  
> +    assert(!f->buffered_mode);
> +
>      /* Find and release all the contiguous memory ranges marked as may_free. */
>      idx = find_next_bit(fb->may_free, fb->iovcnt, 0);
>      if (idx >= fb->iovcnt) {
> @@ -221,6 +277,147 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>      bitmap_zero(fb->may_free, MAX_IOV_SIZE);
>  }
>  
> +static void advance_buf_ptr(QEMUFile *f, size_t size)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    /* must not advance to 0 */
> +    assert(size);
> +    /* must not overflow buf_index (int) */
> +    assert(fb->buf_index + size <= INT_MAX);
> +    /* must not exceed buf_size */
> +    assert(fb->buf_index + size <= fb->buf_size);
> +
> +    fb->buf_index += size;
> +}
> +
> +static size_t get_buf_free_size(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    /* buf_index can't be greated than buf_size */
> +    assert(fb->buf_size >= fb->buf_index);
> +    return fb->buf_size - fb->buf_index;
> +}
> +
> +static size_t get_buf_used_size(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    return fb->buf_index;
> +}
> +
> +static uint8_t *get_buf_ptr(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    /* protects from out of bound reading */
> +    assert(fb->buf_index <= IO_BUF_SIZE);
> +    return fb->buf + fb->buf_index;
> +}
> +
> +static bool buf_is_full(QEMUFile *f)
> +{
> +    return get_buf_free_size(f) == 0;
> +}
> +
> +static void reset_buf(QEMUFile *f)
> +{
> +    QEMUFileBuffer *fb = f->current_buf;
> +    fb->buf_index = 0;
> +}
> +
> +static int write_task_fn(AioTask *task)
> +{
> +    int ret;
> +    Error *local_error = NULL;
> +    QEMUFileAioTask *t = (QEMUFileAioTask *) task;
> +    QEMUFile *f = t->f;
> +    QEMUFileBuffer *fb = t->fb;
> +    uint64_t pos = f->pos;
> +    struct iovec v = (struct iovec) {
> +        .iov_base = fb->buf,
> +        .iov_len = fb->buf_index,
> +    };
> +
> +    assert(f->buffered_mode);
> +
> +    /*
> +     * Increment file position.
> +     * This needs to be here before calling writev_buffer, because
> +     * writev_buffer is asynchronous and there could be more than one
> +     * writev_buffer started simultaniously. Each writev_buffer should
> +     * use its own file pos to write to. writev_buffer may write less
> +     * than buf_index bytes but we treat this situation as an error.
> +     * If error appeared, further file using is meaningless.
> +     * We expect that, the most of the time the full buffer is written,
> +     * (when buf_size == buf_index). The only case when the non-full
> +     * buffer is written (buf_size != buf_index) is file close,
> +     * when we need to flush the rest of the buffer content.
> +     */
> +    f->pos += fb->buf_index;
> +
> +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
> +
> +    /* return the just written buffer to the free list */
> +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
> +
> +    /* check that we have written everything */
> +    if (ret != fb->buf_index) {
> +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
> +    }
> +
> +    /*
> +     * always return 0 - don't use task error handling, relay on
> +     * qemu file error handling
> +     */
> +    return 0;
> +}
> +
> +static void qemu_file_switch_current_buf(QEMUFile *f)
> +{
> +    /*
> +     * if the list is empty, wait until some task returns a buffer
> +     * to the list of free buffers.
> +     */
> +    if (QLIST_EMPTY(&f->free_buffers)) {
> +        aio_task_pool_wait_slot(f->pool);
> +    }
> +
> +    /*
> +     * sanity check that the list isn't empty
> +     * if the free list was empty, we waited for a task complition,
> +     * and the pompleted task must return a buffer to a list of free buffers
> +     */
> +    assert(!QLIST_EMPTY(&f->free_buffers));
> +
> +    /* set the current buffer for using from the free list */
> +    f->current_buf = QLIST_FIRST(&f->free_buffers);
> +    reset_buf(f);
> +
> +    QLIST_REMOVE(f->current_buf, link);
> +}
> +
> +/**
> + *  Asynchronously flushes QEMUFile buffer
> + *
> + * This will flush all pending data. If data was only partially flushed, it
> + * will set an error state. The function may return before the data actually
> + * written.
> + */
> +static void flush_buffer(QEMUFile *f)
> +{
> +    QEMUFileAioTask *t = g_new(QEMUFileAioTask, 1);
> +
> +    *t = (QEMUFileAioTask) {
> +        .task.func = &write_task_fn,
> +        .f = f,
> +        .fb = f->current_buf,
> +    };
> +
> +    /* aio_task_pool should free t for us */
> +    aio_task_pool_start_task(f->pool, (AioTask *) t);
> +
> +    /* if no errors this will switch the buffer */
> +    qemu_file_switch_current_buf(f);
> +}
> +
>  /**
>   * Flushes QEMUFile buffer
>   *
> @@ -241,7 +438,13 @@ void qemu_fflush(QEMUFile *f)
>      if (f->shutdown) {
>          return;
>      }
> +
> +    if (f->buffered_mode) {
> +        return;
> +    }
> +
>      if (fb->iovcnt > 0) {
> +        /* this is non-buffered mode */
>          expect = iov_size(fb->iov, fb->iovcnt);
>          ret = f->ops->writev_buffer(f->opaque, fb->iov, fb->iovcnt, f->pos,
>                                      &local_error);
> @@ -378,6 +581,7 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>  
>  void qemu_update_position(QEMUFile *f, size_t size)
>  {
> +    assert(!f->buffered_mode);
>      f->pos += size;
>  }
>  
> @@ -392,7 +596,18 @@ void qemu_update_position(QEMUFile *f, size_t size)
>  int qemu_fclose(QEMUFile *f)
>  {
>      int ret;
> -    qemu_fflush(f);
> +
> +    if (qemu_file_is_writable(f) && f->buffered_mode) {
> +        ret = qemu_file_get_error(f);
> +        if (!ret) {
> +            flush_buffer(f);
> +        }
> +        /* wait until all tasks are done */
> +        aio_task_pool_wait_all(f->pool);
> +    } else {
> +        qemu_fflush(f);
> +    }
> +
>      ret = qemu_file_get_error(f);
>  
>      if (f->ops->close) {
> @@ -408,16 +623,77 @@ int qemu_fclose(QEMUFile *f)
>          ret = f->last_error;
>      }
>      error_free(f->last_error_obj);
> -    g_free(f->current_buf->buf);
> -    g_free(f->current_buf->iov);
> -    g_free(f->current_buf->may_free);
> -    g_free(f->current_buf);
> +
> +    if (f->buffered_mode) {
> +        QEMUFileBuffer *fb, *next;
> +        /*
> +         * put the current back to the free buffers list
> +         * to destroy all the buffers in one loop
> +         */
> +        QLIST_INSERT_HEAD(&f->free_buffers, f->current_buf, link);
> +
> +        /* destroy all the buffers */
> +        QLIST_FOREACH_SAFE(fb, &f->free_buffers, link, next) {
> +            QLIST_REMOVE(fb, link);
> +            /* looks like qemu_vfree pairs with qemu_memalign */
> +            qemu_vfree(fb->buf);
> +            g_free(fb);
> +        }
> +        g_free(f->pool);
> +    } else {
> +        g_free(f->current_buf->buf);
> +        g_free(f->current_buf->iov);
> +        g_free(f->current_buf->may_free);
> +        g_free(f->current_buf);
> +    }
> +
>      g_free(f);
>      trace_qemu_file_fclose();
>      return ret;
>  }
>  
>  /*
> + * Copy an external buffer to the intenal current buffer.
> + */
> +static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
> +                     bool may_free)
> +{
> +    size_t data_size = size;
> +    const uint8_t *src_ptr = buf;
> +
> +    assert(f->buffered_mode);
> +    assert(size <= INT_MAX);
> +
> +    while (data_size > 0) {
> +        size_t chunk_size;
> +
> +        if (buf_is_full(f)) {
> +            flush_buffer(f);
> +            if (qemu_file_get_error(f)) {
> +                return;
> +            }
> +        }
> +
> +        chunk_size = MIN(get_buf_free_size(f), data_size);
> +
> +        memcpy(get_buf_ptr(f), src_ptr, chunk_size);
> +
> +        advance_buf_ptr(f, chunk_size);
> +
> +        src_ptr += chunk_size;
> +        data_size -= chunk_size;
> +        f->bytes_xfer += chunk_size;
> +    }
> +
> +    if (may_free) {
> +        if (qemu_madvise((void *) buf, size, QEMU_MADV_DONTNEED) < 0) {
> +            error_report("migrate: madvise DONTNEED failed %p %zd: %s",
> +                         buf, size, strerror(errno));
> +        }
> +    }
> +}
> +
> +/*
>   * Add buf to iovec. Do flush if iovec is full.
>   *
>   * Return values:
> @@ -454,6 +730,9 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size,
>  static void add_buf_to_iovec(QEMUFile *f, size_t len)
>  {
>      QEMUFileBuffer *fb = f->current_buf;
> +
> +    assert(!f->buffered_mode);
> +
>      if (!add_to_iovec(f, fb->buf + fb->buf_index, len, false)) {
>          fb->buf_index += len;
>          if (fb->buf_index == IO_BUF_SIZE) {
> @@ -469,8 +748,12 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size,
>          return;
>      }
>  
> -    f->bytes_xfer += size;
> -    add_to_iovec(f, buf, size, may_free);
> +    if (f->buffered_mode) {
> +        copy_buf(f, buf, size, may_free);
> +    } else {
> +        f->bytes_xfer += size;
> +        add_to_iovec(f, buf, size, may_free);
> +    }
>  }
>  
>  void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
> @@ -482,6 +765,11 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
>          return;
>      }
>  
> +    if (f->buffered_mode) {
> +        copy_buf(f, buf, size, false);
> +        return;
> +    }
> +
>      while (size > 0) {
>          l = IO_BUF_SIZE - fb->buf_index;
>          if (l > size) {
> @@ -506,15 +794,21 @@ void qemu_put_byte(QEMUFile *f, int v)
>          return;
>      }
>  
> -    fb->buf[fb->buf_index] = v;
> -    f->bytes_xfer++;
> -    add_buf_to_iovec(f, 1);
> +    if (f->buffered_mode) {
> +        copy_buf(f, (const uint8_t *) &v, 1, false);
> +    } else {
> +        fb->buf[fb->buf_index] = v;
> +        add_buf_to_iovec(f, 1);
> +        f->bytes_xfer++;
> +    }
>  }
>  
>  void qemu_file_skip(QEMUFile *f, int size)
>  {
>      QEMUFileBuffer *fb = f->current_buf;
>  
> +    assert(!f->buffered_mode);
> +
>      if (fb->buf_index + size <= fb->buf_size) {
>          fb->buf_index += size;
>      }
> @@ -672,10 +966,14 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>  {
>      int64_t ret = f->pos;
>      int i;
> -    QEMUFileBuffer *fb = f->current_buf;
>  
> -    for (i = 0; i < fb->iovcnt; i++) {
> -        ret += fb->iov[i].iov_len;
> +    if (f->buffered_mode) {
> +        ret += get_buf_used_size(f);
> +    } else {
> +        QEMUFileBuffer *fb = f->current_buf;
> +        for (i = 0; i < fb->iovcnt; i++) {
> +            ret += fb->iov[i].iov_len;
> +        }
>      }
>  
>      return ret;
> @@ -683,8 +981,12 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>  
>  int64_t qemu_ftell(QEMUFile *f)
>  {
> -    qemu_fflush(f);
> -    return f->pos;
> +    if (f->buffered_mode) {
> +        return qemu_ftell_fast(f);
> +    } else {
> +        qemu_fflush(f);
> +        return f->pos;
> +    }
>  }
>  
>  int qemu_file_rate_limit(QEMUFile *f)
> @@ -803,6 +1105,8 @@ ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
>      QEMUFileBuffer *fb = f->current_buf;
>      ssize_t blen = IO_BUF_SIZE - fb->buf_index - sizeof(int32_t);
>  
> +    assert(!f->buffered_mode);
> +
>      if (blen < compressBound(size)) {
>          return -1;
>      }
> @@ -827,6 +1131,9 @@ int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src)
>      int len = 0;
>      QEMUFileBuffer *fb_src = f_src->current_buf;
>  
> +    assert(!f_des->buffered_mode);
> +    assert(!f_src->buffered_mode);
> +
>      if (fb_src->buf_index > 0) {
>          len = fb_src->buf_index;
>          qemu_put_buffer(f_des, fb_src->buf, fb_src->buf_index);
> diff --git a/migration/qemu-file.h b/migration/qemu-file.h
> index a9b6d6c..08655d2 100644
> --- a/migration/qemu-file.h
> +++ b/migration/qemu-file.h
> @@ -103,6 +103,14 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
>  typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
>                                     Error **errp);
>  
> +/*
> + * Enables or disables the buffered mode
> + * Existing blocking reads/writes must be woken
> + * Returns true if the buffered mode has to be enabled,
> + * false if it has to be disabled.
> + */
> +typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);
> +
>  typedef struct QEMUFileOps {
>      QEMUFileGetBufferFunc *get_buffer;
>      QEMUFileCloseFunc *close;
> @@ -110,6 +118,7 @@ typedef struct QEMUFileOps {
>      QEMUFileWritevBufferFunc *writev_buffer;
>      QEMURetPathFunc *get_return_path;
>      QEMUFileShutdownFunc *shut_down;
> +    QEMUFileEnableBufferedFunc *enable_buffered;
>  } QEMUFileOps;
>  
>  typedef struct QEMUFileHooks {
> -- 
> 1.8.3.1
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Denis Plotnikov April 28, 2020, 8:06 a.m. UTC | #7
On 27.04.2020 15:14, Dr. David Alan Gilbert wrote:
> * Denis Plotnikov (dplotnikov@virtuozzo.com) wrote:
>> The patch adds ability to qemu-file to write the data
>> asynchronously to improve the performance on writing.
>> Before, only synchronous writing was supported.
>>
>> Enabling of the asyncronous mode is managed by new
>> "enabled_buffered" callback.
> It's a bit invasive isn't it - changes a lot of functions in a lot of
> places!

If you mean changing the qemu-file code - yes, it is.

If you mean changing the qemu-file usage in the code - no.
The only place to change is the snapshot code when the buffered mode is 
enabled with a callback.
The change is in 03 patch of the series.

> The multifd code separated the control headers from the data on separate
> fd's - but that doesn't help your case.

yes, that doesn't help
>
> Is there any chance you could do this by using the existing 'save_page'
> hook (that RDMA uses).

I don't think so. My goal is to improve writing performance of
the internal snapshot to qcow2 image. The snapshot is saved in qcow2 as
continuous stream placed in the end of address space.
To achieve the best writing speed I need a size and base-aligned buffer
containing the vm state (with ram) which looks like that (related to ram):

... | ram page header | ram page | ram page header | ram page | ... and 
so on

to store the buffer in qcow2 with a single operation.

'save_page' would allow me not to store 'ram page' in the qemu-file 
internal structures,
and write my own ram page storing logic. I think that wouldn't help me a 
lot because:
1. I need a page with the ram page header
2. I want to reduce the number of io operations
3. I want to save other parts of vm state as fast as possible

May be I can't see the better way of using 'save page' callback.
Could you suggest anything?

Denis
> In the cover letter you mention direct qemu_fflush calls - have we got a
> few too many in some palces that you think we can clean out?

I'm not sure that some of them are excessive. To the best of my knowlege,
qemu-file is used for the source-destination communication on migration
and removing some qemu_fflush-es may break communication logic.

Snapshot is just a special case (if not the only) when we know that we 
can do buffered (cached)
writings. Do you know any other cases when the buffered (cached) mode 
could be useful?

>
> Dave
>
>> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
>> ---
>>   include/qemu/typedefs.h |   1 +
>>   migration/qemu-file.c   | 351 +++++++++++++++++++++++++++++++++++++++++++++---
>>   migration/qemu-file.h   |   9 ++
>>   3 files changed, 339 insertions(+), 22 deletions(-)
>>
>> diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
>> index 88dce54..9b388c8 100644
>> --- a/include/qemu/typedefs.h
>> +++ b/include/qemu/typedefs.h
>> @@ -98,6 +98,7 @@ typedef struct QEMUBH QEMUBH;
>>   typedef struct QemuConsole QemuConsole;
>>   typedef struct QEMUFile QEMUFile;
>>   typedef struct QEMUFileBuffer QEMUFileBuffer;
>> +typedef struct QEMUFileAioTask QEMUFileAioTask;
>>   typedef struct QemuLockable QemuLockable;
>>   typedef struct QemuMutex QemuMutex;
>>   typedef struct QemuOpt QemuOpt;
>> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>> index 285c6ef..f42f949 100644
>> --- a/migration/qemu-file.c
>> +++ b/migration/qemu-file.c
>> @@ -29,19 +29,25 @@
>>   #include "qemu-file.h"
>>   #include "trace.h"
>>   #include "qapi/error.h"
>> +#include "block/aio_task.h"
>>   
>> -#define IO_BUF_SIZE 32768
>> +#define IO_BUF_SIZE (1024 * 1024)
>>   #define MAX_IOV_SIZE MIN(IOV_MAX, 64)
>> +#define IO_BUF_NUM 2
>> +#define IO_BUF_ALIGNMENT 512
>>   
>> -QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, 512));
>> +QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, IO_BUF_ALIGNMENT));
>> +QEMU_BUILD_BUG_ON(IO_BUF_SIZE > INT_MAX);
>> +QEMU_BUILD_BUG_ON(IO_BUF_NUM <= 0);
>>   
>>   struct QEMUFileBuffer {
>>       int buf_index;
>> -    int buf_size; /* 0 when writing */
>> +    int buf_size; /* 0 when non-buffered writing */
>>       uint8_t *buf;
>>       unsigned long *may_free;
>>       struct iovec *iov;
>>       unsigned int iovcnt;
>> +    QLIST_ENTRY(QEMUFileBuffer) link;
>>   };
>>   
>>   struct QEMUFile {
>> @@ -60,6 +66,22 @@ struct QEMUFile {
>>       bool shutdown;
>>       /* currently used buffer */
>>       QEMUFileBuffer *current_buf;
>> +    /*
>> +     * with buffered_mode enabled all the data copied to 512 byte
>> +     * aligned buffer, including iov data. Then the buffer is passed
>> +     * to writev_buffer callback.
>> +     */
>> +    bool buffered_mode;
>> +    /* for async buffer writing */
>> +    AioTaskPool *pool;
>> +    /* the list of free buffers, currently used on is NOT there */
>> +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
>> +};
>> +
>> +struct QEMUFileAioTask {
>> +    AioTask task;
>> +    QEMUFile *f;
>> +    QEMUFileBuffer *fb;
>>   };
>>   
>>   /*
>> @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
>>       f->opaque = opaque;
>>       f->ops = ops;
>>   
>> -    f->current_buf = g_new0(QEMUFileBuffer, 1);
>> -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>> -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>> -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>> +    if (f->ops->enable_buffered) {
>> +        f->buffered_mode = f->ops->enable_buffered(f->opaque);
>> +    }
>> +
>> +    if (f->buffered_mode && qemu_file_is_writable(f)) {
>> +        int i;
>> +        /*
>> +         * in buffered_mode we don't use internal io vectors
>> +         * and may_free bitmap, because we copy the data to be
>> +         * written right away to the buffer
>> +         */
>> +        f->pool = aio_task_pool_new(IO_BUF_NUM);
>> +
>> +        /* allocate io buffers */
>> +        for (i = 0; i < IO_BUF_NUM; i++) {
>> +            QEMUFileBuffer *fb = g_new0(QEMUFileBuffer, 1);
>> +
>> +            fb->buf = qemu_memalign(IO_BUF_ALIGNMENT, IO_BUF_SIZE);
>> +            fb->buf_size = IO_BUF_SIZE;
>> +
>> +            /*
>> +             * put the first buffer to the current buf and the rest
>> +             * to the list of free buffers
>> +             */
>> +            if (i == 0) {
>> +                f->current_buf = fb;
>> +            } else {
>> +                QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>> +            }
>> +        }
>> +    } else {
>> +        f->current_buf = g_new0(QEMUFileBuffer, 1);
>> +        f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>> +        f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>> +        f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>> +    }
>>   
>>       return f;
>>   }
>> @@ -190,6 +244,8 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>>       unsigned long idx;
>>       QEMUFileBuffer *fb = f->current_buf;
>>   
>> +    assert(!f->buffered_mode);
>> +
>>       /* Find and release all the contiguous memory ranges marked as may_free. */
>>       idx = find_next_bit(fb->may_free, fb->iovcnt, 0);
>>       if (idx >= fb->iovcnt) {
>> @@ -221,6 +277,147 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>>       bitmap_zero(fb->may_free, MAX_IOV_SIZE);
>>   }
>>   
>> +static void advance_buf_ptr(QEMUFile *f, size_t size)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    /* must not advance to 0 */
>> +    assert(size);
>> +    /* must not overflow buf_index (int) */
>> +    assert(fb->buf_index + size <= INT_MAX);
>> +    /* must not exceed buf_size */
>> +    assert(fb->buf_index + size <= fb->buf_size);
>> +
>> +    fb->buf_index += size;
>> +}
>> +
>> +static size_t get_buf_free_size(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    /* buf_index can't be greated than buf_size */
>> +    assert(fb->buf_size >= fb->buf_index);
>> +    return fb->buf_size - fb->buf_index;
>> +}
>> +
>> +static size_t get_buf_used_size(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    return fb->buf_index;
>> +}
>> +
>> +static uint8_t *get_buf_ptr(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    /* protects from out of bound reading */
>> +    assert(fb->buf_index <= IO_BUF_SIZE);
>> +    return fb->buf + fb->buf_index;
>> +}
>> +
>> +static bool buf_is_full(QEMUFile *f)
>> +{
>> +    return get_buf_free_size(f) == 0;
>> +}
>> +
>> +static void reset_buf(QEMUFile *f)
>> +{
>> +    QEMUFileBuffer *fb = f->current_buf;
>> +    fb->buf_index = 0;
>> +}
>> +
>> +static int write_task_fn(AioTask *task)
>> +{
>> +    int ret;
>> +    Error *local_error = NULL;
>> +    QEMUFileAioTask *t = (QEMUFileAioTask *) task;
>> +    QEMUFile *f = t->f;
>> +    QEMUFileBuffer *fb = t->fb;
>> +    uint64_t pos = f->pos;
>> +    struct iovec v = (struct iovec) {
>> +        .iov_base = fb->buf,
>> +        .iov_len = fb->buf_index,
>> +    };
>> +
>> +    assert(f->buffered_mode);
>> +
>> +    /*
>> +     * Increment file position.
>> +     * This needs to be here before calling writev_buffer, because
>> +     * writev_buffer is asynchronous and there could be more than one
>> +     * writev_buffer started simultaniously. Each writev_buffer should
>> +     * use its own file pos to write to. writev_buffer may write less
>> +     * than buf_index bytes but we treat this situation as an error.
>> +     * If error appeared, further file using is meaningless.
>> +     * We expect that, the most of the time the full buffer is written,
>> +     * (when buf_size == buf_index). The only case when the non-full
>> +     * buffer is written (buf_size != buf_index) is file close,
>> +     * when we need to flush the rest of the buffer content.
>> +     */
>> +    f->pos += fb->buf_index;
>> +
>> +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
>> +
>> +    /* return the just written buffer to the free list */
>> +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>> +
>> +    /* check that we have written everything */
>> +    if (ret != fb->buf_index) {
>> +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
>> +    }
>> +
>> +    /*
>> +     * always return 0 - don't use task error handling, relay on
>> +     * qemu file error handling
>> +     */
>> +    return 0;
>> +}
>> +
>> +static void qemu_file_switch_current_buf(QEMUFile *f)
>> +{
>> +    /*
>> +     * if the list is empty, wait until some task returns a buffer
>> +     * to the list of free buffers.
>> +     */
>> +    if (QLIST_EMPTY(&f->free_buffers)) {
>> +        aio_task_pool_wait_slot(f->pool);
>> +    }
>> +
>> +    /*
>> +     * sanity check that the list isn't empty
>> +     * if the free list was empty, we waited for a task complition,
>> +     * and the pompleted task must return a buffer to a list of free buffers
>> +     */
>> +    assert(!QLIST_EMPTY(&f->free_buffers));
>> +
>> +    /* set the current buffer for using from the free list */
>> +    f->current_buf = QLIST_FIRST(&f->free_buffers);
>> +    reset_buf(f);
>> +
>> +    QLIST_REMOVE(f->current_buf, link);
>> +}
>> +
>> +/**
>> + *  Asynchronously flushes QEMUFile buffer
>> + *
>> + * This will flush all pending data. If data was only partially flushed, it
>> + * will set an error state. The function may return before the data actually
>> + * written.
>> + */
>> +static void flush_buffer(QEMUFile *f)
>> +{
>> +    QEMUFileAioTask *t = g_new(QEMUFileAioTask, 1);
>> +
>> +    *t = (QEMUFileAioTask) {
>> +        .task.func = &write_task_fn,
>> +        .f = f,
>> +        .fb = f->current_buf,
>> +    };
>> +
>> +    /* aio_task_pool should free t for us */
>> +    aio_task_pool_start_task(f->pool, (AioTask *) t);
>> +
>> +    /* if no errors this will switch the buffer */
>> +    qemu_file_switch_current_buf(f);
>> +}
>> +
>>   /**
>>    * Flushes QEMUFile buffer
>>    *
>> @@ -241,7 +438,13 @@ void qemu_fflush(QEMUFile *f)
>>       if (f->shutdown) {
>>           return;
>>       }
>> +
>> +    if (f->buffered_mode) {
>> +        return;
>> +    }
>> +
>>       if (fb->iovcnt > 0) {
>> +        /* this is non-buffered mode */
>>           expect = iov_size(fb->iov, fb->iovcnt);
>>           ret = f->ops->writev_buffer(f->opaque, fb->iov, fb->iovcnt, f->pos,
>>                                       &local_error);
>> @@ -378,6 +581,7 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>>   
>>   void qemu_update_position(QEMUFile *f, size_t size)
>>   {
>> +    assert(!f->buffered_mode);
>>       f->pos += size;
>>   }
>>   
>> @@ -392,7 +596,18 @@ void qemu_update_position(QEMUFile *f, size_t size)
>>   int qemu_fclose(QEMUFile *f)
>>   {
>>       int ret;
>> -    qemu_fflush(f);
>> +
>> +    if (qemu_file_is_writable(f) && f->buffered_mode) {
>> +        ret = qemu_file_get_error(f);
>> +        if (!ret) {
>> +            flush_buffer(f);
>> +        }
>> +        /* wait until all tasks are done */
>> +        aio_task_pool_wait_all(f->pool);
>> +    } else {
>> +        qemu_fflush(f);
>> +    }
>> +
>>       ret = qemu_file_get_error(f);
>>   
>>       if (f->ops->close) {
>> @@ -408,16 +623,77 @@ int qemu_fclose(QEMUFile *f)
>>           ret = f->last_error;
>>       }
>>       error_free(f->last_error_obj);
>> -    g_free(f->current_buf->buf);
>> -    g_free(f->current_buf->iov);
>> -    g_free(f->current_buf->may_free);
>> -    g_free(f->current_buf);
>> +
>> +    if (f->buffered_mode) {
>> +        QEMUFileBuffer *fb, *next;
>> +        /*
>> +         * put the current back to the free buffers list
>> +         * to destroy all the buffers in one loop
>> +         */
>> +        QLIST_INSERT_HEAD(&f->free_buffers, f->current_buf, link);
>> +
>> +        /* destroy all the buffers */
>> +        QLIST_FOREACH_SAFE(fb, &f->free_buffers, link, next) {
>> +            QLIST_REMOVE(fb, link);
>> +            /* looks like qemu_vfree pairs with qemu_memalign */
>> +            qemu_vfree(fb->buf);
>> +            g_free(fb);
>> +        }
>> +        g_free(f->pool);
>> +    } else {
>> +        g_free(f->current_buf->buf);
>> +        g_free(f->current_buf->iov);
>> +        g_free(f->current_buf->may_free);
>> +        g_free(f->current_buf);
>> +    }
>> +
>>       g_free(f);
>>       trace_qemu_file_fclose();
>>       return ret;
>>   }
>>   
>>   /*
>> + * Copy an external buffer to the intenal current buffer.
>> + */
>> +static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
>> +                     bool may_free)
>> +{
>> +    size_t data_size = size;
>> +    const uint8_t *src_ptr = buf;
>> +
>> +    assert(f->buffered_mode);
>> +    assert(size <= INT_MAX);
>> +
>> +    while (data_size > 0) {
>> +        size_t chunk_size;
>> +
>> +        if (buf_is_full(f)) {
>> +            flush_buffer(f);
>> +            if (qemu_file_get_error(f)) {
>> +                return;
>> +            }
>> +        }
>> +
>> +        chunk_size = MIN(get_buf_free_size(f), data_size);
>> +
>> +        memcpy(get_buf_ptr(f), src_ptr, chunk_size);
>> +
>> +        advance_buf_ptr(f, chunk_size);
>> +
>> +        src_ptr += chunk_size;
>> +        data_size -= chunk_size;
>> +        f->bytes_xfer += chunk_size;
>> +    }
>> +
>> +    if (may_free) {
>> +        if (qemu_madvise((void *) buf, size, QEMU_MADV_DONTNEED) < 0) {
>> +            error_report("migrate: madvise DONTNEED failed %p %zd: %s",
>> +                         buf, size, strerror(errno));
>> +        }
>> +    }
>> +}
>> +
>> +/*
>>    * Add buf to iovec. Do flush if iovec is full.
>>    *
>>    * Return values:
>> @@ -454,6 +730,9 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size,
>>   static void add_buf_to_iovec(QEMUFile *f, size_t len)
>>   {
>>       QEMUFileBuffer *fb = f->current_buf;
>> +
>> +    assert(!f->buffered_mode);
>> +
>>       if (!add_to_iovec(f, fb->buf + fb->buf_index, len, false)) {
>>           fb->buf_index += len;
>>           if (fb->buf_index == IO_BUF_SIZE) {
>> @@ -469,8 +748,12 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size,
>>           return;
>>       }
>>   
>> -    f->bytes_xfer += size;
>> -    add_to_iovec(f, buf, size, may_free);
>> +    if (f->buffered_mode) {
>> +        copy_buf(f, buf, size, may_free);
>> +    } else {
>> +        f->bytes_xfer += size;
>> +        add_to_iovec(f, buf, size, may_free);
>> +    }
>>   }
>>   
>>   void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
>> @@ -482,6 +765,11 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
>>           return;
>>       }
>>   
>> +    if (f->buffered_mode) {
>> +        copy_buf(f, buf, size, false);
>> +        return;
>> +    }
>> +
>>       while (size > 0) {
>>           l = IO_BUF_SIZE - fb->buf_index;
>>           if (l > size) {
>> @@ -506,15 +794,21 @@ void qemu_put_byte(QEMUFile *f, int v)
>>           return;
>>       }
>>   
>> -    fb->buf[fb->buf_index] = v;
>> -    f->bytes_xfer++;
>> -    add_buf_to_iovec(f, 1);
>> +    if (f->buffered_mode) {
>> +        copy_buf(f, (const uint8_t *) &v, 1, false);
>> +    } else {
>> +        fb->buf[fb->buf_index] = v;
>> +        add_buf_to_iovec(f, 1);
>> +        f->bytes_xfer++;
>> +    }
>>   }
>>   
>>   void qemu_file_skip(QEMUFile *f, int size)
>>   {
>>       QEMUFileBuffer *fb = f->current_buf;
>>   
>> +    assert(!f->buffered_mode);
>> +
>>       if (fb->buf_index + size <= fb->buf_size) {
>>           fb->buf_index += size;
>>       }
>> @@ -672,10 +966,14 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>>   {
>>       int64_t ret = f->pos;
>>       int i;
>> -    QEMUFileBuffer *fb = f->current_buf;
>>   
>> -    for (i = 0; i < fb->iovcnt; i++) {
>> -        ret += fb->iov[i].iov_len;
>> +    if (f->buffered_mode) {
>> +        ret += get_buf_used_size(f);
>> +    } else {
>> +        QEMUFileBuffer *fb = f->current_buf;
>> +        for (i = 0; i < fb->iovcnt; i++) {
>> +            ret += fb->iov[i].iov_len;
>> +        }
>>       }
>>   
>>       return ret;
>> @@ -683,8 +981,12 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>>   
>>   int64_t qemu_ftell(QEMUFile *f)
>>   {
>> -    qemu_fflush(f);
>> -    return f->pos;
>> +    if (f->buffered_mode) {
>> +        return qemu_ftell_fast(f);
>> +    } else {
>> +        qemu_fflush(f);
>> +        return f->pos;
>> +    }
>>   }
>>   
>>   int qemu_file_rate_limit(QEMUFile *f)
>> @@ -803,6 +1105,8 @@ ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
>>       QEMUFileBuffer *fb = f->current_buf;
>>       ssize_t blen = IO_BUF_SIZE - fb->buf_index - sizeof(int32_t);
>>   
>> +    assert(!f->buffered_mode);
>> +
>>       if (blen < compressBound(size)) {
>>           return -1;
>>       }
>> @@ -827,6 +1131,9 @@ int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src)
>>       int len = 0;
>>       QEMUFileBuffer *fb_src = f_src->current_buf;
>>   
>> +    assert(!f_des->buffered_mode);
>> +    assert(!f_src->buffered_mode);
>> +
>>       if (fb_src->buf_index > 0) {
>>           len = fb_src->buf_index;
>>           qemu_put_buffer(f_des, fb_src->buf, fb_src->buf_index);
>> diff --git a/migration/qemu-file.h b/migration/qemu-file.h
>> index a9b6d6c..08655d2 100644
>> --- a/migration/qemu-file.h
>> +++ b/migration/qemu-file.h
>> @@ -103,6 +103,14 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
>>   typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
>>                                      Error **errp);
>>   
>> +/*
>> + * Enables or disables the buffered mode
>> + * Existing blocking reads/writes must be woken
>> + * Returns true if the buffered mode has to be enabled,
>> + * false if it has to be disabled.
>> + */
>> +typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);
>> +
>>   typedef struct QEMUFileOps {
>>       QEMUFileGetBufferFunc *get_buffer;
>>       QEMUFileCloseFunc *close;
>> @@ -110,6 +118,7 @@ typedef struct QEMUFileOps {
>>       QEMUFileWritevBufferFunc *writev_buffer;
>>       QEMURetPathFunc *get_return_path;
>>       QEMUFileShutdownFunc *shut_down;
>> +    QEMUFileEnableBufferedFunc *enable_buffered;
>>   } QEMUFileOps;
>>   
>>   typedef struct QEMUFileHooks {
>> -- 
>> 1.8.3.1
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Dr. David Alan Gilbert April 28, 2020, 5:54 p.m. UTC | #8
* Denis Plotnikov (dplotnikov@virtuozzo.com) wrote:
> 
> 
> On 27.04.2020 15:14, Dr. David Alan Gilbert wrote:
> > * Denis Plotnikov (dplotnikov@virtuozzo.com) wrote:
> > > The patch adds ability to qemu-file to write the data
> > > asynchronously to improve the performance on writing.
> > > Before, only synchronous writing was supported.
> > > 
> > > Enabling of the asyncronous mode is managed by new
> > > "enabled_buffered" callback.
> > It's a bit invasive isn't it - changes a lot of functions in a lot of
> > places!
> 
> If you mean changing the qemu-file code - yes, it is.

Yeh that's what I worry about; qemu-file is pretty complex as it is.
Especially when it then passes it to the channel code etc

> If you mean changing the qemu-file usage in the code - no.
> The only place to change is the snapshot code when the buffered mode is
> enabled with a callback.
> The change is in 03 patch of the series.

That's fine - that's easy.

> > The multifd code separated the control headers from the data on separate
> > fd's - but that doesn't help your case.
> 
> yes, that doesn't help
> > 
> > Is there any chance you could do this by using the existing 'save_page'
> > hook (that RDMA uses).
> 
> I don't think so. My goal is to improve writing performance of
> the internal snapshot to qcow2 image. The snapshot is saved in qcow2 as
> continuous stream placed in the end of address space.
> To achieve the best writing speed I need a size and base-aligned buffer
> containing the vm state (with ram) which looks like that (related to ram):
> 
> ... | ram page header | ram page | ram page header | ram page | ... and so
> on
> 
> to store the buffer in qcow2 with a single operation.
> 
> 'save_page' would allow me not to store 'ram page' in the qemu-file internal
> structures,
> and write my own ram page storing logic. I think that wouldn't help me a lot
> because:
> 1. I need a page with the ram page header
> 2. I want to reduce the number of io operations
> 3. I want to save other parts of vm state as fast as possible
> 
> May be I can't see the better way of using 'save page' callback.
> Could you suggest anything?

I guess it depends if we care about keeping the format of the snapshot
the same here;  if we were open to changing it, then we could use
the save_page hook to delay the writes, so we'd have a pile of headers
followed by a pile of pages.

> Denis
> > In the cover letter you mention direct qemu_fflush calls - have we got a
> > few too many in some palces that you think we can clean out?
> 
> I'm not sure that some of them are excessive. To the best of my knowlege,
> qemu-file is used for the source-destination communication on migration
> and removing some qemu_fflush-es may break communication logic.

I can't see any obvious places where it's called during the ram
migration; can you try and give me a hint to where you're seeing it ?

> Snapshot is just a special case (if not the only) when we know that we can
> do buffered (cached)
> writings. Do you know any other cases when the buffered (cached) mode could
> be useful?

The RDMA code does it because it's really not good at small transfers,
but maybe generally it would be a good idea to do larger writes if
possible - something that multifd manages.

Dave

> 
> > 
> > Dave
> > 
> > > Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
> > > ---
> > >   include/qemu/typedefs.h |   1 +
> > >   migration/qemu-file.c   | 351 +++++++++++++++++++++++++++++++++++++++++++++---
> > >   migration/qemu-file.h   |   9 ++
> > >   3 files changed, 339 insertions(+), 22 deletions(-)
> > > 
> > > diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
> > > index 88dce54..9b388c8 100644
> > > --- a/include/qemu/typedefs.h
> > > +++ b/include/qemu/typedefs.h
> > > @@ -98,6 +98,7 @@ typedef struct QEMUBH QEMUBH;
> > >   typedef struct QemuConsole QemuConsole;
> > >   typedef struct QEMUFile QEMUFile;
> > >   typedef struct QEMUFileBuffer QEMUFileBuffer;
> > > +typedef struct QEMUFileAioTask QEMUFileAioTask;
> > >   typedef struct QemuLockable QemuLockable;
> > >   typedef struct QemuMutex QemuMutex;
> > >   typedef struct QemuOpt QemuOpt;
> > > diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> > > index 285c6ef..f42f949 100644
> > > --- a/migration/qemu-file.c
> > > +++ b/migration/qemu-file.c
> > > @@ -29,19 +29,25 @@
> > >   #include "qemu-file.h"
> > >   #include "trace.h"
> > >   #include "qapi/error.h"
> > > +#include "block/aio_task.h"
> > > -#define IO_BUF_SIZE 32768
> > > +#define IO_BUF_SIZE (1024 * 1024)
> > >   #define MAX_IOV_SIZE MIN(IOV_MAX, 64)
> > > +#define IO_BUF_NUM 2
> > > +#define IO_BUF_ALIGNMENT 512
> > > -QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, 512));
> > > +QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, IO_BUF_ALIGNMENT));
> > > +QEMU_BUILD_BUG_ON(IO_BUF_SIZE > INT_MAX);
> > > +QEMU_BUILD_BUG_ON(IO_BUF_NUM <= 0);
> > >   struct QEMUFileBuffer {
> > >       int buf_index;
> > > -    int buf_size; /* 0 when writing */
> > > +    int buf_size; /* 0 when non-buffered writing */
> > >       uint8_t *buf;
> > >       unsigned long *may_free;
> > >       struct iovec *iov;
> > >       unsigned int iovcnt;
> > > +    QLIST_ENTRY(QEMUFileBuffer) link;
> > >   };
> > >   struct QEMUFile {
> > > @@ -60,6 +66,22 @@ struct QEMUFile {
> > >       bool shutdown;
> > >       /* currently used buffer */
> > >       QEMUFileBuffer *current_buf;
> > > +    /*
> > > +     * with buffered_mode enabled all the data copied to 512 byte
> > > +     * aligned buffer, including iov data. Then the buffer is passed
> > > +     * to writev_buffer callback.
> > > +     */
> > > +    bool buffered_mode;
> > > +    /* for async buffer writing */
> > > +    AioTaskPool *pool;
> > > +    /* the list of free buffers, currently used on is NOT there */
> > > +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
> > > +};
> > > +
> > > +struct QEMUFileAioTask {
> > > +    AioTask task;
> > > +    QEMUFile *f;
> > > +    QEMUFileBuffer *fb;
> > >   };
> > >   /*
> > > @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
> > >       f->opaque = opaque;
> > >       f->ops = ops;
> > > -    f->current_buf = g_new0(QEMUFileBuffer, 1);
> > > -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
> > > -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
> > > -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
> > > +    if (f->ops->enable_buffered) {
> > > +        f->buffered_mode = f->ops->enable_buffered(f->opaque);
> > > +    }
> > > +
> > > +    if (f->buffered_mode && qemu_file_is_writable(f)) {
> > > +        int i;
> > > +        /*
> > > +         * in buffered_mode we don't use internal io vectors
> > > +         * and may_free bitmap, because we copy the data to be
> > > +         * written right away to the buffer
> > > +         */
> > > +        f->pool = aio_task_pool_new(IO_BUF_NUM);
> > > +
> > > +        /* allocate io buffers */
> > > +        for (i = 0; i < IO_BUF_NUM; i++) {
> > > +            QEMUFileBuffer *fb = g_new0(QEMUFileBuffer, 1);
> > > +
> > > +            fb->buf = qemu_memalign(IO_BUF_ALIGNMENT, IO_BUF_SIZE);
> > > +            fb->buf_size = IO_BUF_SIZE;
> > > +
> > > +            /*
> > > +             * put the first buffer to the current buf and the rest
> > > +             * to the list of free buffers
> > > +             */
> > > +            if (i == 0) {
> > > +                f->current_buf = fb;
> > > +            } else {
> > > +                QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
> > > +            }
> > > +        }
> > > +    } else {
> > > +        f->current_buf = g_new0(QEMUFileBuffer, 1);
> > > +        f->current_buf->buf = g_malloc(IO_BUF_SIZE);
> > > +        f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
> > > +        f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
> > > +    }
> > >       return f;
> > >   }
> > > @@ -190,6 +244,8 @@ static void qemu_iovec_release_ram(QEMUFile *f)
> > >       unsigned long idx;
> > >       QEMUFileBuffer *fb = f->current_buf;
> > > +    assert(!f->buffered_mode);
> > > +
> > >       /* Find and release all the contiguous memory ranges marked as may_free. */
> > >       idx = find_next_bit(fb->may_free, fb->iovcnt, 0);
> > >       if (idx >= fb->iovcnt) {
> > > @@ -221,6 +277,147 @@ static void qemu_iovec_release_ram(QEMUFile *f)
> > >       bitmap_zero(fb->may_free, MAX_IOV_SIZE);
> > >   }
> > > +static void advance_buf_ptr(QEMUFile *f, size_t size)
> > > +{
> > > +    QEMUFileBuffer *fb = f->current_buf;
> > > +    /* must not advance to 0 */
> > > +    assert(size);
> > > +    /* must not overflow buf_index (int) */
> > > +    assert(fb->buf_index + size <= INT_MAX);
> > > +    /* must not exceed buf_size */
> > > +    assert(fb->buf_index + size <= fb->buf_size);
> > > +
> > > +    fb->buf_index += size;
> > > +}
> > > +
> > > +static size_t get_buf_free_size(QEMUFile *f)
> > > +{
> > > +    QEMUFileBuffer *fb = f->current_buf;
> > > +    /* buf_index can't be greated than buf_size */
> > > +    assert(fb->buf_size >= fb->buf_index);
> > > +    return fb->buf_size - fb->buf_index;
> > > +}
> > > +
> > > +static size_t get_buf_used_size(QEMUFile *f)
> > > +{
> > > +    QEMUFileBuffer *fb = f->current_buf;
> > > +    return fb->buf_index;
> > > +}
> > > +
> > > +static uint8_t *get_buf_ptr(QEMUFile *f)
> > > +{
> > > +    QEMUFileBuffer *fb = f->current_buf;
> > > +    /* protects from out of bound reading */
> > > +    assert(fb->buf_index <= IO_BUF_SIZE);
> > > +    return fb->buf + fb->buf_index;
> > > +}
> > > +
> > > +static bool buf_is_full(QEMUFile *f)
> > > +{
> > > +    return get_buf_free_size(f) == 0;
> > > +}
> > > +
> > > +static void reset_buf(QEMUFile *f)
> > > +{
> > > +    QEMUFileBuffer *fb = f->current_buf;
> > > +    fb->buf_index = 0;
> > > +}
> > > +
> > > +static int write_task_fn(AioTask *task)
> > > +{
> > > +    int ret;
> > > +    Error *local_error = NULL;
> > > +    QEMUFileAioTask *t = (QEMUFileAioTask *) task;
> > > +    QEMUFile *f = t->f;
> > > +    QEMUFileBuffer *fb = t->fb;
> > > +    uint64_t pos = f->pos;
> > > +    struct iovec v = (struct iovec) {
> > > +        .iov_base = fb->buf,
> > > +        .iov_len = fb->buf_index,
> > > +    };
> > > +
> > > +    assert(f->buffered_mode);
> > > +
> > > +    /*
> > > +     * Increment file position.
> > > +     * This needs to be here before calling writev_buffer, because
> > > +     * writev_buffer is asynchronous and there could be more than one
> > > +     * writev_buffer started simultaniously. Each writev_buffer should
> > > +     * use its own file pos to write to. writev_buffer may write less
> > > +     * than buf_index bytes but we treat this situation as an error.
> > > +     * If error appeared, further file using is meaningless.
> > > +     * We expect that, the most of the time the full buffer is written,
> > > +     * (when buf_size == buf_index). The only case when the non-full
> > > +     * buffer is written (buf_size != buf_index) is file close,
> > > +     * when we need to flush the rest of the buffer content.
> > > +     */
> > > +    f->pos += fb->buf_index;
> > > +
> > > +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
> > > +
> > > +    /* return the just written buffer to the free list */
> > > +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
> > > +
> > > +    /* check that we have written everything */
> > > +    if (ret != fb->buf_index) {
> > > +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
> > > +    }
> > > +
> > > +    /*
> > > +     * always return 0 - don't use task error handling, relay on
> > > +     * qemu file error handling
> > > +     */
> > > +    return 0;
> > > +}
> > > +
> > > +static void qemu_file_switch_current_buf(QEMUFile *f)
> > > +{
> > > +    /*
> > > +     * if the list is empty, wait until some task returns a buffer
> > > +     * to the list of free buffers.
> > > +     */
> > > +    if (QLIST_EMPTY(&f->free_buffers)) {
> > > +        aio_task_pool_wait_slot(f->pool);
> > > +    }
> > > +
> > > +    /*
> > > +     * sanity check that the list isn't empty
> > > +     * if the free list was empty, we waited for a task complition,
> > > +     * and the pompleted task must return a buffer to a list of free buffers
> > > +     */
> > > +    assert(!QLIST_EMPTY(&f->free_buffers));
> > > +
> > > +    /* set the current buffer for using from the free list */
> > > +    f->current_buf = QLIST_FIRST(&f->free_buffers);
> > > +    reset_buf(f);
> > > +
> > > +    QLIST_REMOVE(f->current_buf, link);
> > > +}
> > > +
> > > +/**
> > > + *  Asynchronously flushes QEMUFile buffer
> > > + *
> > > + * This will flush all pending data. If data was only partially flushed, it
> > > + * will set an error state. The function may return before the data actually
> > > + * written.
> > > + */
> > > +static void flush_buffer(QEMUFile *f)
> > > +{
> > > +    QEMUFileAioTask *t = g_new(QEMUFileAioTask, 1);
> > > +
> > > +    *t = (QEMUFileAioTask) {
> > > +        .task.func = &write_task_fn,
> > > +        .f = f,
> > > +        .fb = f->current_buf,
> > > +    };
> > > +
> > > +    /* aio_task_pool should free t for us */
> > > +    aio_task_pool_start_task(f->pool, (AioTask *) t);
> > > +
> > > +    /* if no errors this will switch the buffer */
> > > +    qemu_file_switch_current_buf(f);
> > > +}
> > > +
> > >   /**
> > >    * Flushes QEMUFile buffer
> > >    *
> > > @@ -241,7 +438,13 @@ void qemu_fflush(QEMUFile *f)
> > >       if (f->shutdown) {
> > >           return;
> > >       }
> > > +
> > > +    if (f->buffered_mode) {
> > > +        return;
> > > +    }
> > > +
> > >       if (fb->iovcnt > 0) {
> > > +        /* this is non-buffered mode */
> > >           expect = iov_size(fb->iov, fb->iovcnt);
> > >           ret = f->ops->writev_buffer(f->opaque, fb->iov, fb->iovcnt, f->pos,
> > >                                       &local_error);
> > > @@ -378,6 +581,7 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
> > >   void qemu_update_position(QEMUFile *f, size_t size)
> > >   {
> > > +    assert(!f->buffered_mode);
> > >       f->pos += size;
> > >   }
> > > @@ -392,7 +596,18 @@ void qemu_update_position(QEMUFile *f, size_t size)
> > >   int qemu_fclose(QEMUFile *f)
> > >   {
> > >       int ret;
> > > -    qemu_fflush(f);
> > > +
> > > +    if (qemu_file_is_writable(f) && f->buffered_mode) {
> > > +        ret = qemu_file_get_error(f);
> > > +        if (!ret) {
> > > +            flush_buffer(f);
> > > +        }
> > > +        /* wait until all tasks are done */
> > > +        aio_task_pool_wait_all(f->pool);
> > > +    } else {
> > > +        qemu_fflush(f);
> > > +    }
> > > +
> > >       ret = qemu_file_get_error(f);
> > >       if (f->ops->close) {
> > > @@ -408,16 +623,77 @@ int qemu_fclose(QEMUFile *f)
> > >           ret = f->last_error;
> > >       }
> > >       error_free(f->last_error_obj);
> > > -    g_free(f->current_buf->buf);
> > > -    g_free(f->current_buf->iov);
> > > -    g_free(f->current_buf->may_free);
> > > -    g_free(f->current_buf);
> > > +
> > > +    if (f->buffered_mode) {
> > > +        QEMUFileBuffer *fb, *next;
> > > +        /*
> > > +         * put the current back to the free buffers list
> > > +         * to destroy all the buffers in one loop
> > > +         */
> > > +        QLIST_INSERT_HEAD(&f->free_buffers, f->current_buf, link);
> > > +
> > > +        /* destroy all the buffers */
> > > +        QLIST_FOREACH_SAFE(fb, &f->free_buffers, link, next) {
> > > +            QLIST_REMOVE(fb, link);
> > > +            /* looks like qemu_vfree pairs with qemu_memalign */
> > > +            qemu_vfree(fb->buf);
> > > +            g_free(fb);
> > > +        }
> > > +        g_free(f->pool);
> > > +    } else {
> > > +        g_free(f->current_buf->buf);
> > > +        g_free(f->current_buf->iov);
> > > +        g_free(f->current_buf->may_free);
> > > +        g_free(f->current_buf);
> > > +    }
> > > +
> > >       g_free(f);
> > >       trace_qemu_file_fclose();
> > >       return ret;
> > >   }
> > >   /*
> > > + * Copy an external buffer to the intenal current buffer.
> > > + */
> > > +static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
> > > +                     bool may_free)
> > > +{
> > > +    size_t data_size = size;
> > > +    const uint8_t *src_ptr = buf;
> > > +
> > > +    assert(f->buffered_mode);
> > > +    assert(size <= INT_MAX);
> > > +
> > > +    while (data_size > 0) {
> > > +        size_t chunk_size;
> > > +
> > > +        if (buf_is_full(f)) {
> > > +            flush_buffer(f);
> > > +            if (qemu_file_get_error(f)) {
> > > +                return;
> > > +            }
> > > +        }
> > > +
> > > +        chunk_size = MIN(get_buf_free_size(f), data_size);
> > > +
> > > +        memcpy(get_buf_ptr(f), src_ptr, chunk_size);
> > > +
> > > +        advance_buf_ptr(f, chunk_size);
> > > +
> > > +        src_ptr += chunk_size;
> > > +        data_size -= chunk_size;
> > > +        f->bytes_xfer += chunk_size;
> > > +    }
> > > +
> > > +    if (may_free) {
> > > +        if (qemu_madvise((void *) buf, size, QEMU_MADV_DONTNEED) < 0) {
> > > +            error_report("migrate: madvise DONTNEED failed %p %zd: %s",
> > > +                         buf, size, strerror(errno));
> > > +        }
> > > +    }
> > > +}
> > > +
> > > +/*
> > >    * Add buf to iovec. Do flush if iovec is full.
> > >    *
> > >    * Return values:
> > > @@ -454,6 +730,9 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size,
> > >   static void add_buf_to_iovec(QEMUFile *f, size_t len)
> > >   {
> > >       QEMUFileBuffer *fb = f->current_buf;
> > > +
> > > +    assert(!f->buffered_mode);
> > > +
> > >       if (!add_to_iovec(f, fb->buf + fb->buf_index, len, false)) {
> > >           fb->buf_index += len;
> > >           if (fb->buf_index == IO_BUF_SIZE) {
> > > @@ -469,8 +748,12 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size,
> > >           return;
> > >       }
> > > -    f->bytes_xfer += size;
> > > -    add_to_iovec(f, buf, size, may_free);
> > > +    if (f->buffered_mode) {
> > > +        copy_buf(f, buf, size, may_free);
> > > +    } else {
> > > +        f->bytes_xfer += size;
> > > +        add_to_iovec(f, buf, size, may_free);
> > > +    }
> > >   }
> > >   void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
> > > @@ -482,6 +765,11 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
> > >           return;
> > >       }
> > > +    if (f->buffered_mode) {
> > > +        copy_buf(f, buf, size, false);
> > > +        return;
> > > +    }
> > > +
> > >       while (size > 0) {
> > >           l = IO_BUF_SIZE - fb->buf_index;
> > >           if (l > size) {
> > > @@ -506,15 +794,21 @@ void qemu_put_byte(QEMUFile *f, int v)
> > >           return;
> > >       }
> > > -    fb->buf[fb->buf_index] = v;
> > > -    f->bytes_xfer++;
> > > -    add_buf_to_iovec(f, 1);
> > > +    if (f->buffered_mode) {
> > > +        copy_buf(f, (const uint8_t *) &v, 1, false);
> > > +    } else {
> > > +        fb->buf[fb->buf_index] = v;
> > > +        add_buf_to_iovec(f, 1);
> > > +        f->bytes_xfer++;
> > > +    }
> > >   }
> > >   void qemu_file_skip(QEMUFile *f, int size)
> > >   {
> > >       QEMUFileBuffer *fb = f->current_buf;
> > > +    assert(!f->buffered_mode);
> > > +
> > >       if (fb->buf_index + size <= fb->buf_size) {
> > >           fb->buf_index += size;
> > >       }
> > > @@ -672,10 +966,14 @@ int64_t qemu_ftell_fast(QEMUFile *f)
> > >   {
> > >       int64_t ret = f->pos;
> > >       int i;
> > > -    QEMUFileBuffer *fb = f->current_buf;
> > > -    for (i = 0; i < fb->iovcnt; i++) {
> > > -        ret += fb->iov[i].iov_len;
> > > +    if (f->buffered_mode) {
> > > +        ret += get_buf_used_size(f);
> > > +    } else {
> > > +        QEMUFileBuffer *fb = f->current_buf;
> > > +        for (i = 0; i < fb->iovcnt; i++) {
> > > +            ret += fb->iov[i].iov_len;
> > > +        }
> > >       }
> > >       return ret;
> > > @@ -683,8 +981,12 @@ int64_t qemu_ftell_fast(QEMUFile *f)
> > >   int64_t qemu_ftell(QEMUFile *f)
> > >   {
> > > -    qemu_fflush(f);
> > > -    return f->pos;
> > > +    if (f->buffered_mode) {
> > > +        return qemu_ftell_fast(f);
> > > +    } else {
> > > +        qemu_fflush(f);
> > > +        return f->pos;
> > > +    }
> > >   }
> > >   int qemu_file_rate_limit(QEMUFile *f)
> > > @@ -803,6 +1105,8 @@ ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
> > >       QEMUFileBuffer *fb = f->current_buf;
> > >       ssize_t blen = IO_BUF_SIZE - fb->buf_index - sizeof(int32_t);
> > > +    assert(!f->buffered_mode);
> > > +
> > >       if (blen < compressBound(size)) {
> > >           return -1;
> > >       }
> > > @@ -827,6 +1131,9 @@ int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src)
> > >       int len = 0;
> > >       QEMUFileBuffer *fb_src = f_src->current_buf;
> > > +    assert(!f_des->buffered_mode);
> > > +    assert(!f_src->buffered_mode);
> > > +
> > >       if (fb_src->buf_index > 0) {
> > >           len = fb_src->buf_index;
> > >           qemu_put_buffer(f_des, fb_src->buf, fb_src->buf_index);
> > > diff --git a/migration/qemu-file.h b/migration/qemu-file.h
> > > index a9b6d6c..08655d2 100644
> > > --- a/migration/qemu-file.h
> > > +++ b/migration/qemu-file.h
> > > @@ -103,6 +103,14 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
> > >   typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
> > >                                      Error **errp);
> > > +/*
> > > + * Enables or disables the buffered mode
> > > + * Existing blocking reads/writes must be woken
> > > + * Returns true if the buffered mode has to be enabled,
> > > + * false if it has to be disabled.
> > > + */
> > > +typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);
> > > +
> > >   typedef struct QEMUFileOps {
> > >       QEMUFileGetBufferFunc *get_buffer;
> > >       QEMUFileCloseFunc *close;
> > > @@ -110,6 +118,7 @@ typedef struct QEMUFileOps {
> > >       QEMUFileWritevBufferFunc *writev_buffer;
> > >       QEMURetPathFunc *get_return_path;
> > >       QEMUFileShutdownFunc *shut_down;
> > > +    QEMUFileEnableBufferedFunc *enable_buffered;
> > >   } QEMUFileOps;
> > >   typedef struct QEMUFileHooks {
> > > -- 
> > > 1.8.3.1
> > > 
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Denis Plotnikov April 28, 2020, 8:25 p.m. UTC | #9
On 28.04.2020 20:54, Dr. David Alan Gilbert wrote:
> * Denis Plotnikov (dplotnikov@virtuozzo.com) wrote:
>>
>> On 27.04.2020 15:14, Dr. David Alan Gilbert wrote:
>>> * Denis Plotnikov (dplotnikov@virtuozzo.com) wrote:
>>>> The patch adds ability to qemu-file to write the data
>>>> asynchronously to improve the performance on writing.
>>>> Before, only synchronous writing was supported.
>>>>
>>>> Enabling of the asyncronous mode is managed by new
>>>> "enabled_buffered" callback.
>>> It's a bit invasive isn't it - changes a lot of functions in a lot of
>>> places!
>> If you mean changing the qemu-file code - yes, it is.
> Yeh that's what I worry about; qemu-file is pretty complex as it is.
> Especially when it then passes it to the channel code etc
>
>> If you mean changing the qemu-file usage in the code - no.
>> The only place to change is the snapshot code when the buffered mode is
>> enabled with a callback.
>> The change is in 03 patch of the series.
> That's fine - that's easy.
>
>>> The multifd code separated the control headers from the data on separate
>>> fd's - but that doesn't help your case.
>> yes, that doesn't help
>>> Is there any chance you could do this by using the existing 'save_page'
>>> hook (that RDMA uses).
>> I don't think so. My goal is to improve writing performance of
>> the internal snapshot to qcow2 image. The snapshot is saved in qcow2 as
>> continuous stream placed in the end of address space.
>> To achieve the best writing speed I need a size and base-aligned buffer
>> containing the vm state (with ram) which looks like that (related to ram):
>>
>> ... | ram page header | ram page | ram page header | ram page | ... and so
>> on
>>
>> to store the buffer in qcow2 with a single operation.
>>
>> 'save_page' would allow me not to store 'ram page' in the qemu-file internal
>> structures,
>> and write my own ram page storing logic. I think that wouldn't help me a lot
>> because:
>> 1. I need a page with the ram page header
>> 2. I want to reduce the number of io operations
>> 3. I want to save other parts of vm state as fast as possible
>>
>> May be I can't see the better way of using 'save page' callback.
>> Could you suggest anything?
> I guess it depends if we care about keeping the format of the snapshot
> the same here;  if we were open to changing it, then we could use
> the save_page hook to delay the writes, so we'd have a pile of headers
> followed by a pile of pages.

I think we have to care about keeping the format. Because many users 
already have internal snapshots
saved in the qcow2 images, if we change the format we can't load 
snapshots from those images
as well as make snapshots non-readable for older qemu-s or we need to 
support two versions of format
which I think is too complicated.

>
>> Denis
>>> In the cover letter you mention direct qemu_fflush calls - have we got a
>>> few too many in some palces that you think we can clean out?
>> I'm not sure that some of them are excessive. To the best of my knowlege,
>> qemu-file is used for the source-destination communication on migration
>> and removing some qemu_fflush-es may break communication logic.
> I can't see any obvious places where it's called during the ram
> migration; can you try and give me a hint to where you're seeing it ?

I think those qemu_fflush-es aren't in the ram migration rather than in 
other vm state parts.
Although, those parts are quite small in comparison to ram, I saw quite 
a lot of qemu_fflush-es while debugging.
Still, we could benefit saving them with fewer number of io operation if 
we going to use buffered mode.

Denis

>
>> Snapshot is just a special case (if not the only) when we know that we can
>> do buffered (cached)
>> writings. Do you know any other cases when the buffered (cached) mode could
>> be useful?
> The RDMA code does it because it's really not good at small transfers,
> but maybe generally it would be a good idea to do larger writes if
> possible - something that multifd manages.
>
> Dave
>
>>> Dave
>>>
>>>> Signed-off-by: Denis Plotnikov <dplotnikov@virtuozzo.com>
>>>> ---
>>>>    include/qemu/typedefs.h |   1 +
>>>>    migration/qemu-file.c   | 351 +++++++++++++++++++++++++++++++++++++++++++++---
>>>>    migration/qemu-file.h   |   9 ++
>>>>    3 files changed, 339 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
>>>> index 88dce54..9b388c8 100644
>>>> --- a/include/qemu/typedefs.h
>>>> +++ b/include/qemu/typedefs.h
>>>> @@ -98,6 +98,7 @@ typedef struct QEMUBH QEMUBH;
>>>>    typedef struct QemuConsole QemuConsole;
>>>>    typedef struct QEMUFile QEMUFile;
>>>>    typedef struct QEMUFileBuffer QEMUFileBuffer;
>>>> +typedef struct QEMUFileAioTask QEMUFileAioTask;
>>>>    typedef struct QemuLockable QemuLockable;
>>>>    typedef struct QemuMutex QemuMutex;
>>>>    typedef struct QemuOpt QemuOpt;
>>>> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
>>>> index 285c6ef..f42f949 100644
>>>> --- a/migration/qemu-file.c
>>>> +++ b/migration/qemu-file.c
>>>> @@ -29,19 +29,25 @@
>>>>    #include "qemu-file.h"
>>>>    #include "trace.h"
>>>>    #include "qapi/error.h"
>>>> +#include "block/aio_task.h"
>>>> -#define IO_BUF_SIZE 32768
>>>> +#define IO_BUF_SIZE (1024 * 1024)
>>>>    #define MAX_IOV_SIZE MIN(IOV_MAX, 64)
>>>> +#define IO_BUF_NUM 2
>>>> +#define IO_BUF_ALIGNMENT 512
>>>> -QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, 512));
>>>> +QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, IO_BUF_ALIGNMENT));
>>>> +QEMU_BUILD_BUG_ON(IO_BUF_SIZE > INT_MAX);
>>>> +QEMU_BUILD_BUG_ON(IO_BUF_NUM <= 0);
>>>>    struct QEMUFileBuffer {
>>>>        int buf_index;
>>>> -    int buf_size; /* 0 when writing */
>>>> +    int buf_size; /* 0 when non-buffered writing */
>>>>        uint8_t *buf;
>>>>        unsigned long *may_free;
>>>>        struct iovec *iov;
>>>>        unsigned int iovcnt;
>>>> +    QLIST_ENTRY(QEMUFileBuffer) link;
>>>>    };
>>>>    struct QEMUFile {
>>>> @@ -60,6 +66,22 @@ struct QEMUFile {
>>>>        bool shutdown;
>>>>        /* currently used buffer */
>>>>        QEMUFileBuffer *current_buf;
>>>> +    /*
>>>> +     * with buffered_mode enabled all the data copied to 512 byte
>>>> +     * aligned buffer, including iov data. Then the buffer is passed
>>>> +     * to writev_buffer callback.
>>>> +     */
>>>> +    bool buffered_mode;
>>>> +    /* for async buffer writing */
>>>> +    AioTaskPool *pool;
>>>> +    /* the list of free buffers, currently used on is NOT there */
>>>> +    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
>>>> +};
>>>> +
>>>> +struct QEMUFileAioTask {
>>>> +    AioTask task;
>>>> +    QEMUFile *f;
>>>> +    QEMUFileBuffer *fb;
>>>>    };
>>>>    /*
>>>> @@ -115,10 +137,42 @@ QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
>>>>        f->opaque = opaque;
>>>>        f->ops = ops;
>>>> -    f->current_buf = g_new0(QEMUFileBuffer, 1);
>>>> -    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>>>> -    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>>>> -    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>>>> +    if (f->ops->enable_buffered) {
>>>> +        f->buffered_mode = f->ops->enable_buffered(f->opaque);
>>>> +    }
>>>> +
>>>> +    if (f->buffered_mode && qemu_file_is_writable(f)) {
>>>> +        int i;
>>>> +        /*
>>>> +         * in buffered_mode we don't use internal io vectors
>>>> +         * and may_free bitmap, because we copy the data to be
>>>> +         * written right away to the buffer
>>>> +         */
>>>> +        f->pool = aio_task_pool_new(IO_BUF_NUM);
>>>> +
>>>> +        /* allocate io buffers */
>>>> +        for (i = 0; i < IO_BUF_NUM; i++) {
>>>> +            QEMUFileBuffer *fb = g_new0(QEMUFileBuffer, 1);
>>>> +
>>>> +            fb->buf = qemu_memalign(IO_BUF_ALIGNMENT, IO_BUF_SIZE);
>>>> +            fb->buf_size = IO_BUF_SIZE;
>>>> +
>>>> +            /*
>>>> +             * put the first buffer to the current buf and the rest
>>>> +             * to the list of free buffers
>>>> +             */
>>>> +            if (i == 0) {
>>>> +                f->current_buf = fb;
>>>> +            } else {
>>>> +                QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>>>> +            }
>>>> +        }
>>>> +    } else {
>>>> +        f->current_buf = g_new0(QEMUFileBuffer, 1);
>>>> +        f->current_buf->buf = g_malloc(IO_BUF_SIZE);
>>>> +        f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
>>>> +        f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
>>>> +    }
>>>>        return f;
>>>>    }
>>>> @@ -190,6 +244,8 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>>>>        unsigned long idx;
>>>>        QEMUFileBuffer *fb = f->current_buf;
>>>> +    assert(!f->buffered_mode);
>>>> +
>>>>        /* Find and release all the contiguous memory ranges marked as may_free. */
>>>>        idx = find_next_bit(fb->may_free, fb->iovcnt, 0);
>>>>        if (idx >= fb->iovcnt) {
>>>> @@ -221,6 +277,147 @@ static void qemu_iovec_release_ram(QEMUFile *f)
>>>>        bitmap_zero(fb->may_free, MAX_IOV_SIZE);
>>>>    }
>>>> +static void advance_buf_ptr(QEMUFile *f, size_t size)
>>>> +{
>>>> +    QEMUFileBuffer *fb = f->current_buf;
>>>> +    /* must not advance to 0 */
>>>> +    assert(size);
>>>> +    /* must not overflow buf_index (int) */
>>>> +    assert(fb->buf_index + size <= INT_MAX);
>>>> +    /* must not exceed buf_size */
>>>> +    assert(fb->buf_index + size <= fb->buf_size);
>>>> +
>>>> +    fb->buf_index += size;
>>>> +}
>>>> +
>>>> +static size_t get_buf_free_size(QEMUFile *f)
>>>> +{
>>>> +    QEMUFileBuffer *fb = f->current_buf;
>>>> +    /* buf_index can't be greated than buf_size */
>>>> +    assert(fb->buf_size >= fb->buf_index);
>>>> +    return fb->buf_size - fb->buf_index;
>>>> +}
>>>> +
>>>> +static size_t get_buf_used_size(QEMUFile *f)
>>>> +{
>>>> +    QEMUFileBuffer *fb = f->current_buf;
>>>> +    return fb->buf_index;
>>>> +}
>>>> +
>>>> +static uint8_t *get_buf_ptr(QEMUFile *f)
>>>> +{
>>>> +    QEMUFileBuffer *fb = f->current_buf;
>>>> +    /* protects from out of bound reading */
>>>> +    assert(fb->buf_index <= IO_BUF_SIZE);
>>>> +    return fb->buf + fb->buf_index;
>>>> +}
>>>> +
>>>> +static bool buf_is_full(QEMUFile *f)
>>>> +{
>>>> +    return get_buf_free_size(f) == 0;
>>>> +}
>>>> +
>>>> +static void reset_buf(QEMUFile *f)
>>>> +{
>>>> +    QEMUFileBuffer *fb = f->current_buf;
>>>> +    fb->buf_index = 0;
>>>> +}
>>>> +
>>>> +static int write_task_fn(AioTask *task)
>>>> +{
>>>> +    int ret;
>>>> +    Error *local_error = NULL;
>>>> +    QEMUFileAioTask *t = (QEMUFileAioTask *) task;
>>>> +    QEMUFile *f = t->f;
>>>> +    QEMUFileBuffer *fb = t->fb;
>>>> +    uint64_t pos = f->pos;
>>>> +    struct iovec v = (struct iovec) {
>>>> +        .iov_base = fb->buf,
>>>> +        .iov_len = fb->buf_index,
>>>> +    };
>>>> +
>>>> +    assert(f->buffered_mode);
>>>> +
>>>> +    /*
>>>> +     * Increment file position.
>>>> +     * This needs to be here before calling writev_buffer, because
>>>> +     * writev_buffer is asynchronous and there could be more than one
>>>> +     * writev_buffer started simultaniously. Each writev_buffer should
>>>> +     * use its own file pos to write to. writev_buffer may write less
>>>> +     * than buf_index bytes but we treat this situation as an error.
>>>> +     * If error appeared, further file using is meaningless.
>>>> +     * We expect that, the most of the time the full buffer is written,
>>>> +     * (when buf_size == buf_index). The only case when the non-full
>>>> +     * buffer is written (buf_size != buf_index) is file close,
>>>> +     * when we need to flush the rest of the buffer content.
>>>> +     */
>>>> +    f->pos += fb->buf_index;
>>>> +
>>>> +    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
>>>> +
>>>> +    /* return the just written buffer to the free list */
>>>> +    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
>>>> +
>>>> +    /* check that we have written everything */
>>>> +    if (ret != fb->buf_index) {
>>>> +        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * always return 0 - don't use task error handling, relay on
>>>> +     * qemu file error handling
>>>> +     */
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void qemu_file_switch_current_buf(QEMUFile *f)
>>>> +{
>>>> +    /*
>>>> +     * if the list is empty, wait until some task returns a buffer
>>>> +     * to the list of free buffers.
>>>> +     */
>>>> +    if (QLIST_EMPTY(&f->free_buffers)) {
>>>> +        aio_task_pool_wait_slot(f->pool);
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * sanity check that the list isn't empty
>>>> +     * if the free list was empty, we waited for a task complition,
>>>> +     * and the pompleted task must return a buffer to a list of free buffers
>>>> +     */
>>>> +    assert(!QLIST_EMPTY(&f->free_buffers));
>>>> +
>>>> +    /* set the current buffer for using from the free list */
>>>> +    f->current_buf = QLIST_FIRST(&f->free_buffers);
>>>> +    reset_buf(f);
>>>> +
>>>> +    QLIST_REMOVE(f->current_buf, link);
>>>> +}
>>>> +
>>>> +/**
>>>> + *  Asynchronously flushes QEMUFile buffer
>>>> + *
>>>> + * This will flush all pending data. If data was only partially flushed, it
>>>> + * will set an error state. The function may return before the data actually
>>>> + * written.
>>>> + */
>>>> +static void flush_buffer(QEMUFile *f)
>>>> +{
>>>> +    QEMUFileAioTask *t = g_new(QEMUFileAioTask, 1);
>>>> +
>>>> +    *t = (QEMUFileAioTask) {
>>>> +        .task.func = &write_task_fn,
>>>> +        .f = f,
>>>> +        .fb = f->current_buf,
>>>> +    };
>>>> +
>>>> +    /* aio_task_pool should free t for us */
>>>> +    aio_task_pool_start_task(f->pool, (AioTask *) t);
>>>> +
>>>> +    /* if no errors this will switch the buffer */
>>>> +    qemu_file_switch_current_buf(f);
>>>> +}
>>>> +
>>>>    /**
>>>>     * Flushes QEMUFile buffer
>>>>     *
>>>> @@ -241,7 +438,13 @@ void qemu_fflush(QEMUFile *f)
>>>>        if (f->shutdown) {
>>>>            return;
>>>>        }
>>>> +
>>>> +    if (f->buffered_mode) {
>>>> +        return;
>>>> +    }
>>>> +
>>>>        if (fb->iovcnt > 0) {
>>>> +        /* this is non-buffered mode */
>>>>            expect = iov_size(fb->iov, fb->iovcnt);
>>>>            ret = f->ops->writev_buffer(f->opaque, fb->iov, fb->iovcnt, f->pos,
>>>>                                        &local_error);
>>>> @@ -378,6 +581,7 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
>>>>    void qemu_update_position(QEMUFile *f, size_t size)
>>>>    {
>>>> +    assert(!f->buffered_mode);
>>>>        f->pos += size;
>>>>    }
>>>> @@ -392,7 +596,18 @@ void qemu_update_position(QEMUFile *f, size_t size)
>>>>    int qemu_fclose(QEMUFile *f)
>>>>    {
>>>>        int ret;
>>>> -    qemu_fflush(f);
>>>> +
>>>> +    if (qemu_file_is_writable(f) && f->buffered_mode) {
>>>> +        ret = qemu_file_get_error(f);
>>>> +        if (!ret) {
>>>> +            flush_buffer(f);
>>>> +        }
>>>> +        /* wait until all tasks are done */
>>>> +        aio_task_pool_wait_all(f->pool);
>>>> +    } else {
>>>> +        qemu_fflush(f);
>>>> +    }
>>>> +
>>>>        ret = qemu_file_get_error(f);
>>>>        if (f->ops->close) {
>>>> @@ -408,16 +623,77 @@ int qemu_fclose(QEMUFile *f)
>>>>            ret = f->last_error;
>>>>        }
>>>>        error_free(f->last_error_obj);
>>>> -    g_free(f->current_buf->buf);
>>>> -    g_free(f->current_buf->iov);
>>>> -    g_free(f->current_buf->may_free);
>>>> -    g_free(f->current_buf);
>>>> +
>>>> +    if (f->buffered_mode) {
>>>> +        QEMUFileBuffer *fb, *next;
>>>> +        /*
>>>> +         * put the current back to the free buffers list
>>>> +         * to destroy all the buffers in one loop
>>>> +         */
>>>> +        QLIST_INSERT_HEAD(&f->free_buffers, f->current_buf, link);
>>>> +
>>>> +        /* destroy all the buffers */
>>>> +        QLIST_FOREACH_SAFE(fb, &f->free_buffers, link, next) {
>>>> +            QLIST_REMOVE(fb, link);
>>>> +            /* looks like qemu_vfree pairs with qemu_memalign */
>>>> +            qemu_vfree(fb->buf);
>>>> +            g_free(fb);
>>>> +        }
>>>> +        g_free(f->pool);
>>>> +    } else {
>>>> +        g_free(f->current_buf->buf);
>>>> +        g_free(f->current_buf->iov);
>>>> +        g_free(f->current_buf->may_free);
>>>> +        g_free(f->current_buf);
>>>> +    }
>>>> +
>>>>        g_free(f);
>>>>        trace_qemu_file_fclose();
>>>>        return ret;
>>>>    }
>>>>    /*
>>>> + * Copy an external buffer to the intenal current buffer.
>>>> + */
>>>> +static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
>>>> +                     bool may_free)
>>>> +{
>>>> +    size_t data_size = size;
>>>> +    const uint8_t *src_ptr = buf;
>>>> +
>>>> +    assert(f->buffered_mode);
>>>> +    assert(size <= INT_MAX);
>>>> +
>>>> +    while (data_size > 0) {
>>>> +        size_t chunk_size;
>>>> +
>>>> +        if (buf_is_full(f)) {
>>>> +            flush_buffer(f);
>>>> +            if (qemu_file_get_error(f)) {
>>>> +                return;
>>>> +            }
>>>> +        }
>>>> +
>>>> +        chunk_size = MIN(get_buf_free_size(f), data_size);
>>>> +
>>>> +        memcpy(get_buf_ptr(f), src_ptr, chunk_size);
>>>> +
>>>> +        advance_buf_ptr(f, chunk_size);
>>>> +
>>>> +        src_ptr += chunk_size;
>>>> +        data_size -= chunk_size;
>>>> +        f->bytes_xfer += chunk_size;
>>>> +    }
>>>> +
>>>> +    if (may_free) {
>>>> +        if (qemu_madvise((void *) buf, size, QEMU_MADV_DONTNEED) < 0) {
>>>> +            error_report("migrate: madvise DONTNEED failed %p %zd: %s",
>>>> +                         buf, size, strerror(errno));
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>> +/*
>>>>     * Add buf to iovec. Do flush if iovec is full.
>>>>     *
>>>>     * Return values:
>>>> @@ -454,6 +730,9 @@ static int add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size,
>>>>    static void add_buf_to_iovec(QEMUFile *f, size_t len)
>>>>    {
>>>>        QEMUFileBuffer *fb = f->current_buf;
>>>> +
>>>> +    assert(!f->buffered_mode);
>>>> +
>>>>        if (!add_to_iovec(f, fb->buf + fb->buf_index, len, false)) {
>>>>            fb->buf_index += len;
>>>>            if (fb->buf_index == IO_BUF_SIZE) {
>>>> @@ -469,8 +748,12 @@ void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size,
>>>>            return;
>>>>        }
>>>> -    f->bytes_xfer += size;
>>>> -    add_to_iovec(f, buf, size, may_free);
>>>> +    if (f->buffered_mode) {
>>>> +        copy_buf(f, buf, size, may_free);
>>>> +    } else {
>>>> +        f->bytes_xfer += size;
>>>> +        add_to_iovec(f, buf, size, may_free);
>>>> +    }
>>>>    }
>>>>    void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
>>>> @@ -482,6 +765,11 @@ void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
>>>>            return;
>>>>        }
>>>> +    if (f->buffered_mode) {
>>>> +        copy_buf(f, buf, size, false);
>>>> +        return;
>>>> +    }
>>>> +
>>>>        while (size > 0) {
>>>>            l = IO_BUF_SIZE - fb->buf_index;
>>>>            if (l > size) {
>>>> @@ -506,15 +794,21 @@ void qemu_put_byte(QEMUFile *f, int v)
>>>>            return;
>>>>        }
>>>> -    fb->buf[fb->buf_index] = v;
>>>> -    f->bytes_xfer++;
>>>> -    add_buf_to_iovec(f, 1);
>>>> +    if (f->buffered_mode) {
>>>> +        copy_buf(f, (const uint8_t *) &v, 1, false);
>>>> +    } else {
>>>> +        fb->buf[fb->buf_index] = v;
>>>> +        add_buf_to_iovec(f, 1);
>>>> +        f->bytes_xfer++;
>>>> +    }
>>>>    }
>>>>    void qemu_file_skip(QEMUFile *f, int size)
>>>>    {
>>>>        QEMUFileBuffer *fb = f->current_buf;
>>>> +    assert(!f->buffered_mode);
>>>> +
>>>>        if (fb->buf_index + size <= fb->buf_size) {
>>>>            fb->buf_index += size;
>>>>        }
>>>> @@ -672,10 +966,14 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>>>>    {
>>>>        int64_t ret = f->pos;
>>>>        int i;
>>>> -    QEMUFileBuffer *fb = f->current_buf;
>>>> -    for (i = 0; i < fb->iovcnt; i++) {
>>>> -        ret += fb->iov[i].iov_len;
>>>> +    if (f->buffered_mode) {
>>>> +        ret += get_buf_used_size(f);
>>>> +    } else {
>>>> +        QEMUFileBuffer *fb = f->current_buf;
>>>> +        for (i = 0; i < fb->iovcnt; i++) {
>>>> +            ret += fb->iov[i].iov_len;
>>>> +        }
>>>>        }
>>>>        return ret;
>>>> @@ -683,8 +981,12 @@ int64_t qemu_ftell_fast(QEMUFile *f)
>>>>    int64_t qemu_ftell(QEMUFile *f)
>>>>    {
>>>> -    qemu_fflush(f);
>>>> -    return f->pos;
>>>> +    if (f->buffered_mode) {
>>>> +        return qemu_ftell_fast(f);
>>>> +    } else {
>>>> +        qemu_fflush(f);
>>>> +        return f->pos;
>>>> +    }
>>>>    }
>>>>    int qemu_file_rate_limit(QEMUFile *f)
>>>> @@ -803,6 +1105,8 @@ ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
>>>>        QEMUFileBuffer *fb = f->current_buf;
>>>>        ssize_t blen = IO_BUF_SIZE - fb->buf_index - sizeof(int32_t);
>>>> +    assert(!f->buffered_mode);
>>>> +
>>>>        if (blen < compressBound(size)) {
>>>>            return -1;
>>>>        }
>>>> @@ -827,6 +1131,9 @@ int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src)
>>>>        int len = 0;
>>>>        QEMUFileBuffer *fb_src = f_src->current_buf;
>>>> +    assert(!f_des->buffered_mode);
>>>> +    assert(!f_src->buffered_mode);
>>>> +
>>>>        if (fb_src->buf_index > 0) {
>>>>            len = fb_src->buf_index;
>>>>            qemu_put_buffer(f_des, fb_src->buf, fb_src->buf_index);
>>>> diff --git a/migration/qemu-file.h b/migration/qemu-file.h
>>>> index a9b6d6c..08655d2 100644
>>>> --- a/migration/qemu-file.h
>>>> +++ b/migration/qemu-file.h
>>>> @@ -103,6 +103,14 @@ typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
>>>>    typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
>>>>                                       Error **errp);
>>>> +/*
>>>> + * Enables or disables the buffered mode
>>>> + * Existing blocking reads/writes must be woken
>>>> + * Returns true if the buffered mode has to be enabled,
>>>> + * false if it has to be disabled.
>>>> + */
>>>> +typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);
>>>> +
>>>>    typedef struct QEMUFileOps {
>>>>        QEMUFileGetBufferFunc *get_buffer;
>>>>        QEMUFileCloseFunc *close;
>>>> @@ -110,6 +118,7 @@ typedef struct QEMUFileOps {
>>>>        QEMUFileWritevBufferFunc *writev_buffer;
>>>>        QEMURetPathFunc *get_return_path;
>>>>        QEMUFileShutdownFunc *shut_down;
>>>> +    QEMUFileEnableBufferedFunc *enable_buffered;
>>>>    } QEMUFileOps;
>>>>    typedef struct QEMUFileHooks {
>>>> -- 
>>>> 1.8.3.1
>>>>
>>> --
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Daniel P. Berrangé May 4, 2020, 9:08 a.m. UTC | #10
On Mon, Apr 27, 2020 at 01:14:33PM +0100, Dr. David Alan Gilbert wrote:
> * Denis Plotnikov (dplotnikov@virtuozzo.com) wrote:
> > The patch adds ability to qemu-file to write the data
> > asynchronously to improve the performance on writing.
> > Before, only synchronous writing was supported.
> > 
> > Enabling of the asyncronous mode is managed by new
> > "enabled_buffered" callback.
> 
> It's a bit invasive isn't it - changes a lot of functions in a lot of
> places!
> The multifd code separated the control headers from the data on separate
> fd's - but that doesn't help your case.
> 
> Is there any chance you could do this by using the existing 'save_page'
> hook (that RDMA uses).
> 
> In the cover letter you mention direct qemu_fflush calls - have we got a
> few too many in some palces that you think we can clean out?

When I first introduced the QIOChannel framework, I hoped that we could
largely eliminate QEMUFile as a concept.  Thus I'm a bit suspicious of
the idea of introducing more functionality to QEMUFile, especially as the
notion of buffering I/O is rather generic. Is there scope for having a
QIOChannelBuffered object for doing buffering. Would that provide better
isolation from the migration code and thus be less invasive/complex to
maintain ?


Regards,
Daniel
diff mbox series

Patch

diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index 88dce54..9b388c8 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -98,6 +98,7 @@  typedef struct QEMUBH QEMUBH;
 typedef struct QemuConsole QemuConsole;
 typedef struct QEMUFile QEMUFile;
 typedef struct QEMUFileBuffer QEMUFileBuffer;
+typedef struct QEMUFileAioTask QEMUFileAioTask;
 typedef struct QemuLockable QemuLockable;
 typedef struct QemuMutex QemuMutex;
 typedef struct QemuOpt QemuOpt;
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 285c6ef..f42f949 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -29,19 +29,25 @@ 
 #include "qemu-file.h"
 #include "trace.h"
 #include "qapi/error.h"
+#include "block/aio_task.h"
 
-#define IO_BUF_SIZE 32768
+#define IO_BUF_SIZE (1024 * 1024)
 #define MAX_IOV_SIZE MIN(IOV_MAX, 64)
+#define IO_BUF_NUM 2
+#define IO_BUF_ALIGNMENT 512
 
-QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, 512));
+QEMU_BUILD_BUG_ON(!QEMU_IS_ALIGNED(IO_BUF_SIZE, IO_BUF_ALIGNMENT));
+QEMU_BUILD_BUG_ON(IO_BUF_SIZE > INT_MAX);
+QEMU_BUILD_BUG_ON(IO_BUF_NUM <= 0);
 
 struct QEMUFileBuffer {
     int buf_index;
-    int buf_size; /* 0 when writing */
+    int buf_size; /* 0 when non-buffered writing */
     uint8_t *buf;
     unsigned long *may_free;
     struct iovec *iov;
     unsigned int iovcnt;
+    QLIST_ENTRY(QEMUFileBuffer) link;
 };
 
 struct QEMUFile {
@@ -60,6 +66,22 @@  struct QEMUFile {
     bool shutdown;
     /* currently used buffer */
     QEMUFileBuffer *current_buf;
+    /*
+     * with buffered_mode enabled all the data copied to 512 byte
+     * aligned buffer, including iov data. Then the buffer is passed
+     * to writev_buffer callback.
+     */
+    bool buffered_mode;
+    /* for async buffer writing */
+    AioTaskPool *pool;
+    /* the list of free buffers, currently used on is NOT there */
+    QLIST_HEAD(, QEMUFileBuffer) free_buffers;
+};
+
+struct QEMUFileAioTask {
+    AioTask task;
+    QEMUFile *f;
+    QEMUFileBuffer *fb;
 };
 
 /*
@@ -115,10 +137,42 @@  QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops)
     f->opaque = opaque;
     f->ops = ops;
 
-    f->current_buf = g_new0(QEMUFileBuffer, 1);
-    f->current_buf->buf = g_malloc(IO_BUF_SIZE);
-    f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
-    f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
+    if (f->ops->enable_buffered) {
+        f->buffered_mode = f->ops->enable_buffered(f->opaque);
+    }
+
+    if (f->buffered_mode && qemu_file_is_writable(f)) {
+        int i;
+        /*
+         * in buffered_mode we don't use internal io vectors
+         * and may_free bitmap, because we copy the data to be
+         * written right away to the buffer
+         */
+        f->pool = aio_task_pool_new(IO_BUF_NUM);
+
+        /* allocate io buffers */
+        for (i = 0; i < IO_BUF_NUM; i++) {
+            QEMUFileBuffer *fb = g_new0(QEMUFileBuffer, 1);
+
+            fb->buf = qemu_memalign(IO_BUF_ALIGNMENT, IO_BUF_SIZE);
+            fb->buf_size = IO_BUF_SIZE;
+
+            /*
+             * put the first buffer to the current buf and the rest
+             * to the list of free buffers
+             */
+            if (i == 0) {
+                f->current_buf = fb;
+            } else {
+                QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
+            }
+        }
+    } else {
+        f->current_buf = g_new0(QEMUFileBuffer, 1);
+        f->current_buf->buf = g_malloc(IO_BUF_SIZE);
+        f->current_buf->iov = g_new0(struct iovec, MAX_IOV_SIZE);
+        f->current_buf->may_free = bitmap_new(MAX_IOV_SIZE);
+    }
 
     return f;
 }
@@ -190,6 +244,8 @@  static void qemu_iovec_release_ram(QEMUFile *f)
     unsigned long idx;
     QEMUFileBuffer *fb = f->current_buf;
 
+    assert(!f->buffered_mode);
+
     /* Find and release all the contiguous memory ranges marked as may_free. */
     idx = find_next_bit(fb->may_free, fb->iovcnt, 0);
     if (idx >= fb->iovcnt) {
@@ -221,6 +277,147 @@  static void qemu_iovec_release_ram(QEMUFile *f)
     bitmap_zero(fb->may_free, MAX_IOV_SIZE);
 }
 
+static void advance_buf_ptr(QEMUFile *f, size_t size)
+{
+    QEMUFileBuffer *fb = f->current_buf;
+    /* must not advance to 0 */
+    assert(size);
+    /* must not overflow buf_index (int) */
+    assert(fb->buf_index + size <= INT_MAX);
+    /* must not exceed buf_size */
+    assert(fb->buf_index + size <= fb->buf_size);
+
+    fb->buf_index += size;
+}
+
+static size_t get_buf_free_size(QEMUFile *f)
+{
+    QEMUFileBuffer *fb = f->current_buf;
+    /* buf_index can't be greated than buf_size */
+    assert(fb->buf_size >= fb->buf_index);
+    return fb->buf_size - fb->buf_index;
+}
+
+static size_t get_buf_used_size(QEMUFile *f)
+{
+    QEMUFileBuffer *fb = f->current_buf;
+    return fb->buf_index;
+}
+
+static uint8_t *get_buf_ptr(QEMUFile *f)
+{
+    QEMUFileBuffer *fb = f->current_buf;
+    /* protects from out of bound reading */
+    assert(fb->buf_index <= IO_BUF_SIZE);
+    return fb->buf + fb->buf_index;
+}
+
+static bool buf_is_full(QEMUFile *f)
+{
+    return get_buf_free_size(f) == 0;
+}
+
+static void reset_buf(QEMUFile *f)
+{
+    QEMUFileBuffer *fb = f->current_buf;
+    fb->buf_index = 0;
+}
+
+static int write_task_fn(AioTask *task)
+{
+    int ret;
+    Error *local_error = NULL;
+    QEMUFileAioTask *t = (QEMUFileAioTask *) task;
+    QEMUFile *f = t->f;
+    QEMUFileBuffer *fb = t->fb;
+    uint64_t pos = f->pos;
+    struct iovec v = (struct iovec) {
+        .iov_base = fb->buf,
+        .iov_len = fb->buf_index,
+    };
+
+    assert(f->buffered_mode);
+
+    /*
+     * Increment file position.
+     * This needs to be here before calling writev_buffer, because
+     * writev_buffer is asynchronous and there could be more than one
+     * writev_buffer started simultaniously. Each writev_buffer should
+     * use its own file pos to write to. writev_buffer may write less
+     * than buf_index bytes but we treat this situation as an error.
+     * If error appeared, further file using is meaningless.
+     * We expect that, the most of the time the full buffer is written,
+     * (when buf_size == buf_index). The only case when the non-full
+     * buffer is written (buf_size != buf_index) is file close,
+     * when we need to flush the rest of the buffer content.
+     */
+    f->pos += fb->buf_index;
+
+    ret = f->ops->writev_buffer(f->opaque, &v, 1, pos, &local_error);
+
+    /* return the just written buffer to the free list */
+    QLIST_INSERT_HEAD(&f->free_buffers, fb, link);
+
+    /* check that we have written everything */
+    if (ret != fb->buf_index) {
+        qemu_file_set_error_obj(f, ret < 0 ? ret : -EIO, local_error);
+    }
+
+    /*
+     * always return 0 - don't use task error handling, relay on
+     * qemu file error handling
+     */
+    return 0;
+}
+
+static void qemu_file_switch_current_buf(QEMUFile *f)
+{
+    /*
+     * if the list is empty, wait until some task returns a buffer
+     * to the list of free buffers.
+     */
+    if (QLIST_EMPTY(&f->free_buffers)) {
+        aio_task_pool_wait_slot(f->pool);
+    }
+
+    /*
+     * sanity check that the list isn't empty
+     * if the free list was empty, we waited for a task complition,
+     * and the pompleted task must return a buffer to a list of free buffers
+     */
+    assert(!QLIST_EMPTY(&f->free_buffers));
+
+    /* set the current buffer for using from the free list */
+    f->current_buf = QLIST_FIRST(&f->free_buffers);
+    reset_buf(f);
+
+    QLIST_REMOVE(f->current_buf, link);
+}
+
+/**
+ *  Asynchronously flushes QEMUFile buffer
+ *
+ * This will flush all pending data. If data was only partially flushed, it
+ * will set an error state. The function may return before the data actually
+ * written.
+ */
+static void flush_buffer(QEMUFile *f)
+{
+    QEMUFileAioTask *t = g_new(QEMUFileAioTask, 1);
+
+    *t = (QEMUFileAioTask) {
+        .task.func = &write_task_fn,
+        .f = f,
+        .fb = f->current_buf,
+    };
+
+    /* aio_task_pool should free t for us */
+    aio_task_pool_start_task(f->pool, (AioTask *) t);
+
+    /* if no errors this will switch the buffer */
+    qemu_file_switch_current_buf(f);
+}
+
 /**
  * Flushes QEMUFile buffer
  *
@@ -241,7 +438,13 @@  void qemu_fflush(QEMUFile *f)
     if (f->shutdown) {
         return;
     }
+
+    if (f->buffered_mode) {
+        return;
+    }
+
     if (fb->iovcnt > 0) {
+        /* this is non-buffered mode */
         expect = iov_size(fb->iov, fb->iovcnt);
         ret = f->ops->writev_buffer(f->opaque, fb->iov, fb->iovcnt, f->pos,
                                     &local_error);
@@ -378,6 +581,7 @@  static ssize_t qemu_fill_buffer(QEMUFile *f)
 
 void qemu_update_position(QEMUFile *f, size_t size)
 {
+    assert(!f->buffered_mode);
     f->pos += size;
 }
 
@@ -392,7 +596,18 @@  void qemu_update_position(QEMUFile *f, size_t size)
 int qemu_fclose(QEMUFile *f)
 {
     int ret;
-    qemu_fflush(f);
+
+    if (qemu_file_is_writable(f) && f->buffered_mode) {
+        ret = qemu_file_get_error(f);
+        if (!ret) {
+            flush_buffer(f);
+        }
+        /* wait until all tasks are done */
+        aio_task_pool_wait_all(f->pool);
+    } else {
+        qemu_fflush(f);
+    }
+
     ret = qemu_file_get_error(f);
 
     if (f->ops->close) {
@@ -408,16 +623,77 @@  int qemu_fclose(QEMUFile *f)
         ret = f->last_error;
     }
     error_free(f->last_error_obj);
-    g_free(f->current_buf->buf);
-    g_free(f->current_buf->iov);
-    g_free(f->current_buf->may_free);
-    g_free(f->current_buf);
+
+    if (f->buffered_mode) {
+        QEMUFileBuffer *fb, *next;
+        /*
+         * put the current back to the free buffers list
+         * to destroy all the buffers in one loop
+         */
+        QLIST_INSERT_HEAD(&f->free_buffers, f->current_buf, link);
+
+        /* destroy all the buffers */
+        QLIST_FOREACH_SAFE(fb, &f->free_buffers, link, next) {
+            QLIST_REMOVE(fb, link);
+            /* looks like qemu_vfree pairs with qemu_memalign */
+            qemu_vfree(fb->buf);
+            g_free(fb);
+        }
+        g_free(f->pool);
+    } else {
+        g_free(f->current_buf->buf);
+        g_free(f->current_buf->iov);
+        g_free(f->current_buf->may_free);
+        g_free(f->current_buf);
+    }
+
     g_free(f);
     trace_qemu_file_fclose();
     return ret;
 }
 
 /*
+ * Copy an external buffer to the intenal current buffer.
+ */
+static void copy_buf(QEMUFile *f, const uint8_t *buf, size_t size,
+                     bool may_free)
+{
+    size_t data_size = size;
+    const uint8_t *src_ptr = buf;
+
+    assert(f->buffered_mode);
+    assert(size <= INT_MAX);
+
+    while (data_size > 0) {
+        size_t chunk_size;
+
+        if (buf_is_full(f)) {
+            flush_buffer(f);
+            if (qemu_file_get_error(f)) {
+                return;
+            }
+        }
+
+        chunk_size = MIN(get_buf_free_size(f), data_size);
+
+        memcpy(get_buf_ptr(f), src_ptr, chunk_size);
+
+        advance_buf_ptr(f, chunk_size);
+
+        src_ptr += chunk_size;
+        data_size -= chunk_size;
+        f->bytes_xfer += chunk_size;
+    }
+
+    if (may_free) {
+        if (qemu_madvise((void *) buf, size, QEMU_MADV_DONTNEED) < 0) {
+            error_report("migrate: madvise DONTNEED failed %p %zd: %s",
+                         buf, size, strerror(errno));
+        }
+    }
+}
+
+/*
  * Add buf to iovec. Do flush if iovec is full.
  *
  * Return values:
@@ -454,6 +730,9 @@  static int add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size,
 static void add_buf_to_iovec(QEMUFile *f, size_t len)
 {
     QEMUFileBuffer *fb = f->current_buf;
+
+    assert(!f->buffered_mode);
+
     if (!add_to_iovec(f, fb->buf + fb->buf_index, len, false)) {
         fb->buf_index += len;
         if (fb->buf_index == IO_BUF_SIZE) {
@@ -469,8 +748,12 @@  void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size,
         return;
     }
 
-    f->bytes_xfer += size;
-    add_to_iovec(f, buf, size, may_free);
+    if (f->buffered_mode) {
+        copy_buf(f, buf, size, may_free);
+    } else {
+        f->bytes_xfer += size;
+        add_to_iovec(f, buf, size, may_free);
+    }
 }
 
 void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
@@ -482,6 +765,11 @@  void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size)
         return;
     }
 
+    if (f->buffered_mode) {
+        copy_buf(f, buf, size, false);
+        return;
+    }
+
     while (size > 0) {
         l = IO_BUF_SIZE - fb->buf_index;
         if (l > size) {
@@ -506,15 +794,21 @@  void qemu_put_byte(QEMUFile *f, int v)
         return;
     }
 
-    fb->buf[fb->buf_index] = v;
-    f->bytes_xfer++;
-    add_buf_to_iovec(f, 1);
+    if (f->buffered_mode) {
+        copy_buf(f, (const uint8_t *) &v, 1, false);
+    } else {
+        fb->buf[fb->buf_index] = v;
+        add_buf_to_iovec(f, 1);
+        f->bytes_xfer++;
+    }
 }
 
 void qemu_file_skip(QEMUFile *f, int size)
 {
     QEMUFileBuffer *fb = f->current_buf;
 
+    assert(!f->buffered_mode);
+
     if (fb->buf_index + size <= fb->buf_size) {
         fb->buf_index += size;
     }
@@ -672,10 +966,14 @@  int64_t qemu_ftell_fast(QEMUFile *f)
 {
     int64_t ret = f->pos;
     int i;
-    QEMUFileBuffer *fb = f->current_buf;
 
-    for (i = 0; i < fb->iovcnt; i++) {
-        ret += fb->iov[i].iov_len;
+    if (f->buffered_mode) {
+        ret += get_buf_used_size(f);
+    } else {
+        QEMUFileBuffer *fb = f->current_buf;
+        for (i = 0; i < fb->iovcnt; i++) {
+            ret += fb->iov[i].iov_len;
+        }
     }
 
     return ret;
@@ -683,8 +981,12 @@  int64_t qemu_ftell_fast(QEMUFile *f)
 
 int64_t qemu_ftell(QEMUFile *f)
 {
-    qemu_fflush(f);
-    return f->pos;
+    if (f->buffered_mode) {
+        return qemu_ftell_fast(f);
+    } else {
+        qemu_fflush(f);
+        return f->pos;
+    }
 }
 
 int qemu_file_rate_limit(QEMUFile *f)
@@ -803,6 +1105,8 @@  ssize_t qemu_put_compression_data(QEMUFile *f, z_stream *stream,
     QEMUFileBuffer *fb = f->current_buf;
     ssize_t blen = IO_BUF_SIZE - fb->buf_index - sizeof(int32_t);
 
+    assert(!f->buffered_mode);
+
     if (blen < compressBound(size)) {
         return -1;
     }
@@ -827,6 +1131,9 @@  int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src)
     int len = 0;
     QEMUFileBuffer *fb_src = f_src->current_buf;
 
+    assert(!f_des->buffered_mode);
+    assert(!f_src->buffered_mode);
+
     if (fb_src->buf_index > 0) {
         len = fb_src->buf_index;
         qemu_put_buffer(f_des, fb_src->buf, fb_src->buf_index);
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index a9b6d6c..08655d2 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -103,6 +103,14 @@  typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
 typedef int (QEMUFileShutdownFunc)(void *opaque, bool rd, bool wr,
                                    Error **errp);
 
+/*
+ * Enables or disables the buffered mode
+ * Existing blocking reads/writes must be woken
+ * Returns true if the buffered mode has to be enabled,
+ * false if it has to be disabled.
+ */
+typedef bool (QEMUFileEnableBufferedFunc)(void *opaque);
+
 typedef struct QEMUFileOps {
     QEMUFileGetBufferFunc *get_buffer;
     QEMUFileCloseFunc *close;
@@ -110,6 +118,7 @@  typedef struct QEMUFileOps {
     QEMUFileWritevBufferFunc *writev_buffer;
     QEMURetPathFunc *get_return_path;
     QEMUFileShutdownFunc *shut_down;
+    QEMUFileEnableBufferedFunc *enable_buffered;
 } QEMUFileOps;
 
 typedef struct QEMUFileHooks {