Patchwork [RFC,05/20] Introduce put_vector() and get_vector to QEMUFile and qemu_fopen_ops().

login
register
mail settings
Submitter Yoshiaki Tamura
Date April 21, 2010, 5:57 a.m.
Message ID <1271829445-5328-6-git-send-email-tamura.yoshiaki@lab.ntt.co.jp>
Download mbox | patch
Permalink /patch/50624/
State New
Headers show

Comments

Yoshiaki Tamura - April 21, 2010, 5:57 a.m.
QEMUFile currently doesn't support writev().  For sending multiple
data, such as pages, using writev() should be more efficient.

Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
---
 buffered_file.c |    2 +-
 hw/hw.h         |   16 ++++++++++++++++
 savevm.c        |   43 +++++++++++++++++++++++++------------------
 3 files changed, 42 insertions(+), 19 deletions(-)
Anthony Liguori - April 22, 2010, 7:28 p.m.
On 04/21/2010 12:57 AM, Yoshiaki Tamura wrote:
> QEMUFile currently doesn't support writev().  For sending multiple
> data, such as pages, using writev() should be more efficient.
>
> Signed-off-by: Yoshiaki Tamura<tamura.yoshiaki@lab.ntt.co.jp>
>    

Is there performance data that backs this up?  Since QEMUFile uses a 
linear buffer for most operations that's limited to 16k, I suspect you 
wouldn't be able to observe a difference in practice.

Regards,

Anthony Liguori

> ---
>   buffered_file.c |    2 +-
>   hw/hw.h         |   16 ++++++++++++++++
>   savevm.c        |   43 +++++++++++++++++++++++++------------------
>   3 files changed, 42 insertions(+), 19 deletions(-)
>
> diff --git a/buffered_file.c b/buffered_file.c
> index 54dc6c2..187d1d4 100644
> --- a/buffered_file.c
> +++ b/buffered_file.c
> @@ -256,7 +256,7 @@ QEMUFile *qemu_fopen_ops_buffered(void *opaque,
>       s->wait_for_unfreeze = wait_for_unfreeze;
>       s->close = close;
>
> -    s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
> +    s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL, NULL, NULL,
>                                buffered_close, buffered_rate_limit,
>                                buffered_set_rate_limit,
>   			     buffered_get_rate_limit);
> diff --git a/hw/hw.h b/hw/hw.h
> index fc9ed29..921cf90 100644
> --- a/hw/hw.h
> +++ b/hw/hw.h
> @@ -23,6 +23,13 @@
>   typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
>                                       int64_t pos, int size);
>
> +/* This function writes a chunk of vector to a file at the given position.
> + * The pos argument can be ignored if the file is only being used for
> + * streaming.
> + */
> +typedef int (QEMUFilePutVectorFunc)(void *opaque, struct iovec *iov,
> +                                    int64_t pos, int iovcnt);
> +
>   /* Read a chunk of data from a file at the given position.  The pos argument
>    * can be ignored if the file is only be used for streaming.  The number of
>    * bytes actually read should be returned.
> @@ -30,6 +37,13 @@ typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
>   typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf,
>                                       int64_t pos, int size);
>
> +/* Read a chunk of vector from a file at the given position.  The pos argument
> + * can be ignored if the file is only be used for streaming.  The number of
> + * bytes actually read should be returned.
> + */
> +typedef int (QEMUFileGetVectorFunc)(void *opaque, struct iovec *iov,
> +                                    int64_t pos, int iovcnt);
> +
>   /* Close a file and return an error code */
>   typedef int (QEMUFileCloseFunc)(void *opaque);
>
> @@ -46,7 +60,9 @@ typedef size_t (QEMUFileSetRateLimit)(void *opaque, size_t new_rate);
>   typedef size_t (QEMUFileGetRateLimit)(void *opaque);
>
>   QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
> +                         QEMUFilePutVectorFunc *put_vector,
>                            QEMUFileGetBufferFunc *get_buffer,
> +                         QEMUFileGetVectorFunc *get_vector,
>                            QEMUFileCloseFunc *close,
>                            QEMUFileRateLimit *rate_limit,
>                            QEMUFileSetRateLimit *set_rate_limit,
> diff --git a/savevm.c b/savevm.c
> index 490ab70..944e788 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -162,7 +162,9 @@ void qemu_announce_self(void)
>
>   struct QEMUFile {
>       QEMUFilePutBufferFunc *put_buffer;
> +    QEMUFilePutVectorFunc *put_vector;
>       QEMUFileGetBufferFunc *get_buffer;
> +    QEMUFileGetVectorFunc *get_vector;
>       QEMUFileCloseFunc *close;
>       QEMUFileRateLimit *rate_limit;
>       QEMUFileSetRateLimit *set_rate_limit;
> @@ -263,11 +265,11 @@ QEMUFile *qemu_popen(FILE *stdio_file, const char *mode)
>       s->stdio_file = stdio_file;
>
>       if(mode[0] == 'r') {
> -        s->file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_pclose,
> -				 NULL, NULL, NULL);
> +        s->file = qemu_fopen_ops(s, NULL, NULL, stdio_get_buffer,
> +                 NULL, stdio_pclose, NULL, NULL, NULL);
>       } else {
> -        s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_pclose,
> -				 NULL, NULL, NULL);
> +        s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, NULL, NULL,
> +                 stdio_pclose, NULL, NULL, NULL);
>       }
>       return s->file;
>   }
> @@ -312,11 +314,11 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
>           goto fail;
>
>       if(mode[0] == 'r') {
> -        s->file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_fclose,
> -				 NULL, NULL, NULL);
> +        s->file = qemu_fopen_ops(s, NULL, NULL, stdio_get_buffer, NULL,
> +                 stdio_fclose, NULL, NULL, NULL);
>       } else {
> -        s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_fclose,
> -				 NULL, NULL, NULL);
> +        s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, NULL, NULL,
> +                 stdio_fclose, NULL, NULL, NULL);
>       }
>       return s->file;
>
> @@ -330,8 +332,8 @@ QEMUFile *qemu_fopen_socket(int fd)
>       QEMUFileSocket *s = qemu_mallocz(sizeof(QEMUFileSocket));
>
>       s->fd = fd;
> -    s->file = qemu_fopen_ops(s, NULL, socket_get_buffer, socket_close,
> -			     NULL, NULL, NULL);
> +    s->file = qemu_fopen_ops(s, NULL, NULL, socket_get_buffer, NULL,
> +                             socket_close, NULL, NULL, NULL);
>       return s->file;
>   }
>
> @@ -368,11 +370,11 @@ QEMUFile *qemu_fopen(const char *filename, const char *mode)
>           goto fail;
>
>       if(mode[0] == 'w') {
> -        s->file = qemu_fopen_ops(s, file_put_buffer, NULL, stdio_fclose,
> -				 NULL, NULL, NULL);
> +        s->file = qemu_fopen_ops(s, file_put_buffer, NULL, NULL, NULL,
> +                  stdio_fclose, NULL, NULL, NULL);
>       } else {
> -        s->file = qemu_fopen_ops(s, NULL, file_get_buffer, stdio_fclose,
> -			       NULL, NULL, NULL);
> +        s->file = qemu_fopen_ops(s, NULL, NULL, file_get_buffer, NULL,
> +                  stdio_fclose, NULL, NULL, NULL);
>       }
>       return s->file;
>   fail:
> @@ -400,13 +402,16 @@ static int bdrv_fclose(void *opaque)
>   static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
>   {
>       if (is_writable)
> -        return qemu_fopen_ops(bs, block_put_buffer, NULL, bdrv_fclose,
> -			      NULL, NULL, NULL);
> -    return qemu_fopen_ops(bs, NULL, block_get_buffer, bdrv_fclose, NULL, NULL, NULL);
> +        return qemu_fopen_ops(bs, block_put_buffer, NULL, NULL, NULL,
> +                  bdrv_fclose, NULL, NULL, NULL);
> +    return qemu_fopen_ops(bs, NULL, NULL, block_get_buffer, NULL, bdrv_fclose, NULL, NULL, NULL);
>   }
>
> -QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
> +QEMUFile *qemu_fopen_ops(void *opaque,
> +                         QEMUFilePutBufferFunc *put_buffer,
> +                         QEMUFilePutVectorFunc *put_vector,
>                            QEMUFileGetBufferFunc *get_buffer,
> +                         QEMUFileGetVectorFunc *get_vector,
>                            QEMUFileCloseFunc *close,
>                            QEMUFileRateLimit *rate_limit,
>                            QEMUFileSetRateLimit *set_rate_limit,
> @@ -418,7 +423,9 @@ QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
>
>       f->opaque = opaque;
>       f->put_buffer = put_buffer;
> +    f->put_vector = put_vector;
>       f->get_buffer = get_buffer;
> +    f->get_vector = get_vector;
>       f->close = close;
>       f->rate_limit = rate_limit;
>       f->set_rate_limit = set_rate_limit;
>
Yoshiaki Tamura - April 23, 2010, 3:37 a.m.
Anthony Liguori wrote:
> On 04/21/2010 12:57 AM, Yoshiaki Tamura wrote:
>> QEMUFile currently doesn't support writev(). For sending multiple
>> data, such as pages, using writev() should be more efficient.
>>
>> Signed-off-by: Yoshiaki Tamura<tamura.yoshiaki@lab.ntt.co.jp>
>
> Is there performance data that backs this up? Since QEMUFile uses a
> linear buffer for most operations that's limited to 16k, I suspect you
> wouldn't be able to observe a difference in practice.

I currently don't have data, but I'll prepare it.
There were two things I wanted to avoid.

1. Pages to be copied to QEMUFile buf through qemu_put_buffer.
2. Calling write() every time even when we want to send multiple pages at once.

I think 2 may be negligible.
But 1 seems to be problematic if we want to make the latency as small as 
possible, no?

>
> Regards,
>
> Anthony Liguori
>
>> ---
>> buffered_file.c | 2 +-
>> hw/hw.h | 16 ++++++++++++++++
>> savevm.c | 43 +++++++++++++++++++++++++------------------
>> 3 files changed, 42 insertions(+), 19 deletions(-)
>>
>> diff --git a/buffered_file.c b/buffered_file.c
>> index 54dc6c2..187d1d4 100644
>> --- a/buffered_file.c
>> +++ b/buffered_file.c
>> @@ -256,7 +256,7 @@ QEMUFile *qemu_fopen_ops_buffered(void *opaque,
>> s->wait_for_unfreeze = wait_for_unfreeze;
>> s->close = close;
>>
>> - s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
>> + s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL, NULL, NULL,
>> buffered_close, buffered_rate_limit,
>> buffered_set_rate_limit,
>> buffered_get_rate_limit);
>> diff --git a/hw/hw.h b/hw/hw.h
>> index fc9ed29..921cf90 100644
>> --- a/hw/hw.h
>> +++ b/hw/hw.h
>> @@ -23,6 +23,13 @@
>> typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
>> int64_t pos, int size);
>>
>> +/* This function writes a chunk of vector to a file at the given
>> position.
>> + * The pos argument can be ignored if the file is only being used for
>> + * streaming.
>> + */
>> +typedef int (QEMUFilePutVectorFunc)(void *opaque, struct iovec *iov,
>> + int64_t pos, int iovcnt);
>> +
>> /* Read a chunk of data from a file at the given position. The pos
>> argument
>> * can be ignored if the file is only be used for streaming. The number of
>> * bytes actually read should be returned.
>> @@ -30,6 +37,13 @@ typedef int (QEMUFilePutBufferFunc)(void *opaque,
>> const uint8_t *buf,
>> typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf,
>> int64_t pos, int size);
>>
>> +/* Read a chunk of vector from a file at the given position. The pos
>> argument
>> + * can be ignored if the file is only be used for streaming. The
>> number of
>> + * bytes actually read should be returned.
>> + */
>> +typedef int (QEMUFileGetVectorFunc)(void *opaque, struct iovec *iov,
>> + int64_t pos, int iovcnt);
>> +
>> /* Close a file and return an error code */
>> typedef int (QEMUFileCloseFunc)(void *opaque);
>>
>> @@ -46,7 +60,9 @@ typedef size_t (QEMUFileSetRateLimit)(void *opaque,
>> size_t new_rate);
>> typedef size_t (QEMUFileGetRateLimit)(void *opaque);
>>
>> QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
>> + QEMUFilePutVectorFunc *put_vector,
>> QEMUFileGetBufferFunc *get_buffer,
>> + QEMUFileGetVectorFunc *get_vector,
>> QEMUFileCloseFunc *close,
>> QEMUFileRateLimit *rate_limit,
>> QEMUFileSetRateLimit *set_rate_limit,
>> diff --git a/savevm.c b/savevm.c
>> index 490ab70..944e788 100644
>> --- a/savevm.c
>> +++ b/savevm.c
>> @@ -162,7 +162,9 @@ void qemu_announce_self(void)
>>
>> struct QEMUFile {
>> QEMUFilePutBufferFunc *put_buffer;
>> + QEMUFilePutVectorFunc *put_vector;
>> QEMUFileGetBufferFunc *get_buffer;
>> + QEMUFileGetVectorFunc *get_vector;
>> QEMUFileCloseFunc *close;
>> QEMUFileRateLimit *rate_limit;
>> QEMUFileSetRateLimit *set_rate_limit;
>> @@ -263,11 +265,11 @@ QEMUFile *qemu_popen(FILE *stdio_file, const
>> char *mode)
>> s->stdio_file = stdio_file;
>>
>> if(mode[0] == 'r') {
>> - s->file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_pclose,
>> - NULL, NULL, NULL);
>> + s->file = qemu_fopen_ops(s, NULL, NULL, stdio_get_buffer,
>> + NULL, stdio_pclose, NULL, NULL, NULL);
>> } else {
>> - s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_pclose,
>> - NULL, NULL, NULL);
>> + s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, NULL, NULL,
>> + stdio_pclose, NULL, NULL, NULL);
>> }
>> return s->file;
>> }
>> @@ -312,11 +314,11 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
>> goto fail;
>>
>> if(mode[0] == 'r') {
>> - s->file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_fclose,
>> - NULL, NULL, NULL);
>> + s->file = qemu_fopen_ops(s, NULL, NULL, stdio_get_buffer, NULL,
>> + stdio_fclose, NULL, NULL, NULL);
>> } else {
>> - s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_fclose,
>> - NULL, NULL, NULL);
>> + s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, NULL, NULL,
>> + stdio_fclose, NULL, NULL, NULL);
>> }
>> return s->file;
>>
>> @@ -330,8 +332,8 @@ QEMUFile *qemu_fopen_socket(int fd)
>> QEMUFileSocket *s = qemu_mallocz(sizeof(QEMUFileSocket));
>>
>> s->fd = fd;
>> - s->file = qemu_fopen_ops(s, NULL, socket_get_buffer, socket_close,
>> - NULL, NULL, NULL);
>> + s->file = qemu_fopen_ops(s, NULL, NULL, socket_get_buffer, NULL,
>> + socket_close, NULL, NULL, NULL);
>> return s->file;
>> }
>>
>> @@ -368,11 +370,11 @@ QEMUFile *qemu_fopen(const char *filename, const
>> char *mode)
>> goto fail;
>>
>> if(mode[0] == 'w') {
>> - s->file = qemu_fopen_ops(s, file_put_buffer, NULL, stdio_fclose,
>> - NULL, NULL, NULL);
>> + s->file = qemu_fopen_ops(s, file_put_buffer, NULL, NULL, NULL,
>> + stdio_fclose, NULL, NULL, NULL);
>> } else {
>> - s->file = qemu_fopen_ops(s, NULL, file_get_buffer, stdio_fclose,
>> - NULL, NULL, NULL);
>> + s->file = qemu_fopen_ops(s, NULL, NULL, file_get_buffer, NULL,
>> + stdio_fclose, NULL, NULL, NULL);
>> }
>> return s->file;
>> fail:
>> @@ -400,13 +402,16 @@ static int bdrv_fclose(void *opaque)
>> static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
>> {
>> if (is_writable)
>> - return qemu_fopen_ops(bs, block_put_buffer, NULL, bdrv_fclose,
>> - NULL, NULL, NULL);
>> - return qemu_fopen_ops(bs, NULL, block_get_buffer, bdrv_fclose, NULL,
>> NULL, NULL);
>> + return qemu_fopen_ops(bs, block_put_buffer, NULL, NULL, NULL,
>> + bdrv_fclose, NULL, NULL, NULL);
>> + return qemu_fopen_ops(bs, NULL, NULL, block_get_buffer, NULL,
>> bdrv_fclose, NULL, NULL, NULL);
>> }
>>
>> -QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc
>> *put_buffer,
>> +QEMUFile *qemu_fopen_ops(void *opaque,
>> + QEMUFilePutBufferFunc *put_buffer,
>> + QEMUFilePutVectorFunc *put_vector,
>> QEMUFileGetBufferFunc *get_buffer,
>> + QEMUFileGetVectorFunc *get_vector,
>> QEMUFileCloseFunc *close,
>> QEMUFileRateLimit *rate_limit,
>> QEMUFileSetRateLimit *set_rate_limit,
>> @@ -418,7 +423,9 @@ QEMUFile *qemu_fopen_ops(void *opaque,
>> QEMUFilePutBufferFunc *put_buffer,
>>
>> f->opaque = opaque;
>> f->put_buffer = put_buffer;
>> + f->put_vector = put_vector;
>> f->get_buffer = get_buffer;
>> + f->get_vector = get_vector;
>> f->close = close;
>> f->rate_limit = rate_limit;
>> f->set_rate_limit = set_rate_limit;
>
>
>
>
Anthony Liguori - April 23, 2010, 1:22 p.m.
On 04/22/2010 10:37 PM, Yoshiaki Tamura wrote:
> Anthony Liguori wrote:
>> On 04/21/2010 12:57 AM, Yoshiaki Tamura wrote:
>>> QEMUFile currently doesn't support writev(). For sending multiple
>>> data, such as pages, using writev() should be more efficient.
>>>
>>> Signed-off-by: Yoshiaki Tamura<tamura.yoshiaki@lab.ntt.co.jp>
>>
>> Is there performance data that backs this up? Since QEMUFile uses a
>> linear buffer for most operations that's limited to 16k, I suspect you
>> wouldn't be able to observe a difference in practice.
>
> I currently don't have data, but I'll prepare it.
> There were two things I wanted to avoid.
>
> 1. Pages to be copied to QEMUFile buf through qemu_put_buffer.
> 2. Calling write() everytime even when we want to send multiple pages 
> at once.
>
> I think 2 may be neglectable.
> But 1 seems to be problematic if we want make to the latency as small 
> as possible, no?

Copying often has strange CPU characteristics depending on whether the 
data is already in cache.  It's better to drive these sort of 
optimizations through performance measurement because changes are not 
always obvious.

Regards,

Anthony Liguori
Avi Kivity - April 23, 2010, 1:48 p.m.
On 04/23/2010 04:22 PM, Anthony Liguori wrote:
>> I currently don't have data, but I'll prepare it.
>> There were two things I wanted to avoid.
>>
>> 1. Pages to be copied to QEMUFile buf through qemu_put_buffer.
>> 2. Calling write() everytime even when we want to send multiple pages 
>> at once.
>>
>> I think 2 may be neglectable.
>> But 1 seems to be problematic if we want make to the latency as small 
>> as possible, no?
>
>
> Copying often has strange CPU characteristics depending on whether the 
> data is already in cache.  It's better to drive these sort of 
> optimizations through performance measurement because changes are not 
> always obvious.

Copying always introduces more cache pollution, so even if the data is 
in the cache, it is worthwhile (not disagreeing with the need to measure).
Yoshiaki Tamura - April 26, 2010, 10:43 a.m.
Anthony Liguori wrote:
> On 04/22/2010 10:37 PM, Yoshiaki Tamura wrote:
>> Anthony Liguori wrote:
>>> On 04/21/2010 12:57 AM, Yoshiaki Tamura wrote:
>>>> QEMUFile currently doesn't support writev(). For sending multiple
>>>> data, such as pages, using writev() should be more efficient.
>>>>
>>>> Signed-off-by: Yoshiaki Tamura<tamura.yoshiaki@lab.ntt.co.jp>
>>>
>>> Is there performance data that backs this up? Since QEMUFile uses a
>>> linear buffer for most operations that's limited to 16k, I suspect you
>>> wouldn't be able to observe a difference in practice.
>>
>> I currently don't have data, but I'll prepare it.
>> There were two things I wanted to avoid.
>>
>> 1. Pages to be copied to QEMUFile buf through qemu_put_buffer.
>> 2. Calling write() everytime even when we want to send multiple pages
>> at once.
>>
>> I think 2 may be neglectable.
>> But 1 seems to be problematic if we want make to the latency as small
>> as possible, no?
>
> Copying often has strange CPU characteristics depending on whether the
> data is already in cache. It's better to drive these sort of
> optimizations through performance measurement because changes are not
> always obvious.

I agree.
Yoshiaki Tamura - May 3, 2010, 9:32 a.m.
2010/4/23 Avi Kivity <avi@redhat.com>:
> On 04/23/2010 04:22 PM, Anthony Liguori wrote:
>>>
>>> I currently don't have data, but I'll prepare it.
>>> There were two things I wanted to avoid.
>>>
>>> 1. Pages to be copied to QEMUFile buf through qemu_put_buffer.
>>> 2. Calling write() everytime even when we want to send multiple pages at
>>> once.
>>>
>>> I think 2 may be neglectable.
>>> But 1 seems to be problematic if we want make to the latency as small as
>>> possible, no?
>>
>>
>> Copying often has strange CPU characteristics depending on whether the
>> data is already in cache.  It's better to drive these sort of optimizations
>> through performance measurement because changes are not always obvious.
>
> Copying always introduces more cache pollution, so even if the data is in
> the cache, it is worthwhile (not disagreeing with the need to measure).

Anthony,

I measured how long it takes to send all guest pages during migration, and I
would like to share the information in this message.  For convenience,
I modified the code to do a plain migration rather than "live migration",
which means the buffered file is not used here.

In summary, the performance improvement from using writev instead of write/send
seems to be negligible when we used GbE; however, when the underlying network was
fast (InfiniBand with IPoIB in this case), writev performed 17% faster than
write/send, and therefore it may be worthwhile to introduce vectors.

Since QEMU compresses pages, I copied a junk file to tmpfs to dirty pages so that
QEMU would transfer a fair number of pages.  After setting up the guest, I used
cpu_get_real_ticks() to measure the time spent in the while loop calling
ram_save_block() in ram_save_live().  I removed the qemu_file_rate_limit() call to
disable the function of the buffered file, so that all of the pages would be
transferred in the first round.

I measured 10 times for each case, and took the average and standard deviation.
Considering the results, I think the number of trials was enough.  In addition to
the time duration, the number of writev/write calls and the number of pages which
were compressed (dup)/not compressed (nodup) are shown.

Test Environment:
CPU: 2x Intel Xeon Dual Core 3GHz
Mem size: 6GB
Network: GbE, InfiniBand (IPoIB)

Host OS: Fedora 11 (kernel 2.6.34-rc1)
Guest OS: Fedora 11 (kernel 2.6.33)
Guest Mem size: 512MB

* GbE writev
time (sec): 35.732 (std 0.002)
write count: 4 (std 0)
writev count: 8269 (std 1)
dup count: 36157 (std 124)
nodup count: 1016808 (std 147)

* GbE write
time (sec): 35.780 (std 0.164)
write count: 127367 (std 21)
writev count: 0 (std 0)
dup count: 36134 (std 108)
nodup count: 1016853 (std 165)

* IPoIB writev
time (sec): 13.889 (std 0.155)
write count: 4 (std 0)
writev count: 8267 (std 1)
dup count: 36147 (std 105)
nodup count: 1016838 (std 111)

* IPoIB write
time (sec): 16.777 (std 0.239)
write count: 127364 (std 24)
writev count: 0 (std 0)
dup count: 36173 (std 169)
nodup count: 1016840 (std 190)

Although the improvement wasn't obvious when the network was GbE, introducing
writev may be worthwhile when we focus on faster networks like InfiniBand/10GE.

I agree with separating this optimization from the main logic of Kemari, since
this modification must be done widely and carefully at the same time.

Thanks,

Yoshi
Anthony Liguori - May 3, 2010, 12:05 p.m.
On 05/03/2010 04:32 AM, Yoshiaki Tamura wrote:
> 2010/4/23 Avi Kivity<avi@redhat.com>:
>    
>> On 04/23/2010 04:22 PM, Anthony Liguori wrote:
>>      
>>>> I currently don't have data, but I'll prepare it.
>>>> There were two things I wanted to avoid.
>>>>
>>>> 1. Pages to be copied to QEMUFile buf through qemu_put_buffer.
>>>> 2. Calling write() everytime even when we want to send multiple pages at
>>>> once.
>>>>
>>>> I think 2 may be neglectable.
>>>> But 1 seems to be problematic if we want make to the latency as small as
>>>> possible, no?
>>>>          
>>>
>>> Copying often has strange CPU characteristics depending on whether the
>>> data is already in cache.  It's better to drive these sort of optimizations
>>> through performance measurement because changes are not always obvious.
>>>        
>> Copying always introduces more cache pollution, so even if the data is in
>> the cache, it is worthwhile (not disagreeing with the need to measure).
>>      
> Anthony,
>
> I measure how long it takes to send all guest pages during migration, and I
> would like to share the information in this message.  For convenience,
> I modified
> the code to do migration not "live migration" which means buffered file is not
> used here.
>
> In summary, the performance improvement using writev instead of write/send when
> we used GbE seems to be neglectable, however, when the underlying network was
> fast (InfiniBand with IPoIB in this case), writev performed 17% faster than
> write/send, and therefore, it may be worthwhile to introduce vectors.
>
> Since QEMU compresses pages, I copied a junk file to tmpfs to dirty pages to let
> QEMU to transfer fine number of pages.  After setting up the guest, I used
> cpu_get_real_ticks() to measure the time during the while loop calling
> ram_save_block() in ram_save_live().  I removed the qemu_file_rate_limit() to
> disable the function of buffered file, and all of the pages would be transfered
> at the first round.
>
> I measure 10 times for each, and took average and standard deviation.
> Considering the results, I think the trial number was enough.  In addition to
> time duration, number of writev/write and number of pages which were compressed
> (dup)/not compressed (nodup) are demonstrated.
>
> Test Environment:
> CPU: 2x Intel Xeon Dual Core 3GHz
> Mem size: 6GB
> Network: GbE, InfiniBand (IPoIB)
>
> Host OS: Fedora 11 (kernel 2.6.34-rc1)
> Guest OS: Fedora 11 (kernel 2.6.33)
> Guest Mem size: 512MB
>
> * GbE writev
> time (sec): 35.732 (std 0.002)
> write count: 4 (std 0)
> writev count: 8269 (std 1)
> dup count: 36157 (std 124)
> nodup count: 1016808 (std 147)
>
> * GbE write
> time (sec): 35.780 (std 0.164)
> write count: 127367 (21)
> writev count: 0 (std 0)
> dup count: 36134 (std 108)
> nodup count: 1016853 (std 165)
>
> * IPoIB writev
> time (sec): 13.889 (std 0.155)
> write count: 4 (std 0)
> writev count: 8267 (std 1)
> dup count: 36147 (std 105)
> nodup count: 1016838 (std 111)
>
> * IPoIB write
> time (sec): 16.777 (std 0.239)
> write count: 127364 (24)
> writev count: 0 (std 0)
> dup count: 36173 (std 169)
> nodup count: 1016840 (std 190)
>
> Although the improvement wasn't obvious when the network wan GbE, introducing
> writev may be worthwhile when we focus on faster networks like InfiniBand/10GE.
>
> I agree that separating this optimization from the main logic of Kemari since
> this modification must be done widely and carefully at the same time.
>    

Okay.  It looks like it's clear that it's a win so let's split it out of 
the main series and we'll treat it separately.  I imagine we'll see even 
more positive results on 10 gbit and particularly if we move migration 
out into a separate thread.

Regards,

Anthony Liguori

> Thanks,
>
> Yoshi
>
Yoshiaki Tamura - May 3, 2010, 3:36 p.m.
2010/5/3 Anthony Liguori <aliguori@linux.vnet.ibm.com>:
> On 05/03/2010 04:32 AM, Yoshiaki Tamura wrote:
>>
>> 2010/4/23 Avi Kivity<avi@redhat.com>:
>>
>>>
>>> On 04/23/2010 04:22 PM, Anthony Liguori wrote:
>>>
>>>>>
>>>>> I currently don't have data, but I'll prepare it.
>>>>> There were two things I wanted to avoid.
>>>>>
>>>>> 1. Pages to be copied to QEMUFile buf through qemu_put_buffer.
>>>>> 2. Calling write() everytime even when we want to send multiple pages
>>>>> at
>>>>> once.
>>>>>
>>>>> I think 2 may be neglectable.
>>>>> But 1 seems to be problematic if we want make to the latency as small
>>>>> as
>>>>> possible, no?
>>>>>
>>>>
>>>> Copying often has strange CPU characteristics depending on whether the
>>>> data is already in cache.  It's better to drive these sort of
>>>> optimizations
>>>> through performance measurement because changes are not always obvious.
>>>>
>>>
>>> Copying always introduces more cache pollution, so even if the data is in
>>> the cache, it is worthwhile (not disagreeing with the need to measure).
>>>
>>
>> Anthony,
>>
>> I measure how long it takes to send all guest pages during migration, and
>> I
>> would like to share the information in this message.  For convenience,
>> I modified
>> the code to do migration not "live migration" which means buffered file is
>> not
>> used here.
>>
>> In summary, the performance improvement using writev instead of write/send
>> when
>> we used GbE seems to be neglectable, however, when the underlying network
>> was
>> fast (InfiniBand with IPoIB in this case), writev performed 17% faster
>> than
>> write/send, and therefore, it may be worthwhile to introduce vectors.
>>
>> Since QEMU compresses pages, I copied a junk file to tmpfs to dirty pages
>> to let
>> QEMU to transfer fine number of pages.  After setting up the guest, I used
>> cpu_get_real_ticks() to measure the time during the while loop calling
>> ram_save_block() in ram_save_live().  I removed the qemu_file_rate_limit()
>> to
>> disable the function of buffered file, and all of the pages would be
>> transfered
>> at the first round.
>>
>> I measure 10 times for each, and took average and standard deviation.
>> Considering the results, I think the trial number was enough.  In addition
>> to
>> time duration, number of writev/write and number of pages which were
>> compressed
>> (dup)/not compressed (nodup) are demonstrated.
>>
>> Test Environment:
>> CPU: 2x Intel Xeon Dual Core 3GHz
>> Mem size: 6GB
>> Network: GbE, InfiniBand (IPoIB)
>>
>> Host OS: Fedora 11 (kernel 2.6.34-rc1)
>> Guest OS: Fedora 11 (kernel 2.6.33)
>> Guest Mem size: 512MB
>>
>> * GbE writev
>> time (sec): 35.732 (std 0.002)
>> write count: 4 (std 0)
>> writev count: 8269 (std 1)
>> dup count: 36157 (std 124)
>> nodup count: 1016808 (std 147)
>>
>> * GbE write
>> time (sec): 35.780 (std 0.164)
>> write count: 127367 (21)
>> writev count: 0 (std 0)
>> dup count: 36134 (std 108)
>> nodup count: 1016853 (std 165)
>>
>> * IPoIB writev
>> time (sec): 13.889 (std 0.155)
>> write count: 4 (std 0)
>> writev count: 8267 (std 1)
>> dup count: 36147 (std 105)
>> nodup count: 1016838 (std 111)
>>
>> * IPoIB write
>> time (sec): 16.777 (std 0.239)
>> write count: 127364 (24)
>> writev count: 0 (std 0)
>> dup count: 36173 (std 169)
>> nodup count: 1016840 (std 190)
>>
>> Although the improvement wasn't obvious when the network wan GbE,
>> introducing
>> writev may be worthwhile when we focus on faster networks like
>> InfiniBand/10GE.
>>
>> I agree that separating this optimization from the main logic of Kemari
>> since
>> this modification must be done widely and carefully at the same time.
>>
>
> Okay.  It looks like it's clear that it's a win so let's split it out of the
> main series and we'll treat it separately.  I imagine we'll see even more
> positive results on 10 gbit and particularly if we move migration out into a
> separate thread.

Great!
I also wanted to test with 10GE but I'm physically away from my office
now, and can't set up the test environment.  I'll measure the numbers
w/ 10GE next week.

BTW, I was thinking of writing a patch to use separate threads for both the
sender and receiver of migration.  Kemari especially needs a separate
receiver thread, so that the monitor can accept commands from other HA
tools.  Is someone already working on this?  If not, I would add it to
my task list :-)

Thanks,

Yoshi

>
> Regards,
>
> Anthony Liguori
>
>> Thanks,
>>
>> Yoshi
Anthony Liguori - May 3, 2010, 4:07 p.m.
On 05/03/2010 10:36 AM, Yoshiaki Tamura wrote:
>
> Great!
> I also wanted to test with 10GE but I'm physically away from my office
> now, and can't set up the test environment.  I'll measure the numbers
> w/ 10GE next week.
>
> BTW, I was thinking to write a patch to separate threads for both
> sender and receiver of migration.  Kemari especially needs a separate
> thread receiver, so that monitor can accepts commands from other HA
> tools.  Is someone already working on this?  If not, I would add it to
> my task list :-)
>    

So far, no one (to my knowledge at least), is working on this.

Regards,

Anthony Liguori

> Thanks,
>
> Yoshi
>
>    
>> Regards,
>>
>> Anthony Liguori
>>
>>      
>>> Thanks,
>>>
>>> Yoshi
>>>

Patch

diff --git a/buffered_file.c b/buffered_file.c
index 54dc6c2..187d1d4 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -256,7 +256,7 @@  QEMUFile *qemu_fopen_ops_buffered(void *opaque,
     s->wait_for_unfreeze = wait_for_unfreeze;
     s->close = close;
 
-    s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL,
+    s->file = qemu_fopen_ops(s, buffered_put_buffer, NULL, NULL, NULL,
                              buffered_close, buffered_rate_limit,
                              buffered_set_rate_limit,
 			     buffered_get_rate_limit);
diff --git a/hw/hw.h b/hw/hw.h
index fc9ed29..921cf90 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -23,6 +23,13 @@ 
 typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
                                     int64_t pos, int size);
 
+/* This function writes a vector of chunks to a file at the given position.
+ * The pos argument can be ignored if the file is only being used for
+ * streaming.
+ */
+typedef int (QEMUFilePutVectorFunc)(void *opaque, struct iovec *iov,
+                                    int64_t pos, int iovcnt);
+
 /* Read a chunk of data from a file at the given position.  The pos argument
  * can be ignored if the file is only be used for streaming.  The number of
  * bytes actually read should be returned.
@@ -30,6 +37,13 @@  typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf,
 typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf,
                                     int64_t pos, int size);
 
+/* Read a vector of chunks from a file at the given position.  The pos argument
+ * can be ignored if the file is only being used for streaming.  The number of
+ * bytes actually read should be returned.
+ */
+typedef int (QEMUFileGetVectorFunc)(void *opaque, struct iovec *iov,
+                                    int64_t pos, int iovcnt);
+
 /* Close a file and return an error code */
 typedef int (QEMUFileCloseFunc)(void *opaque);
 
@@ -46,7 +60,9 @@  typedef size_t (QEMUFileSetRateLimit)(void *opaque, size_t new_rate);
 typedef size_t (QEMUFileGetRateLimit)(void *opaque);
 
 QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
+                         QEMUFilePutVectorFunc *put_vector,
                          QEMUFileGetBufferFunc *get_buffer,
+                         QEMUFileGetVectorFunc *get_vector,
                          QEMUFileCloseFunc *close,
                          QEMUFileRateLimit *rate_limit,
                          QEMUFileSetRateLimit *set_rate_limit,
diff --git a/savevm.c b/savevm.c
index 490ab70..944e788 100644
--- a/savevm.c
+++ b/savevm.c
@@ -162,7 +162,9 @@  void qemu_announce_self(void)
 
 struct QEMUFile {
     QEMUFilePutBufferFunc *put_buffer;
+    QEMUFilePutVectorFunc *put_vector;
     QEMUFileGetBufferFunc *get_buffer;
+    QEMUFileGetVectorFunc *get_vector;
     QEMUFileCloseFunc *close;
     QEMUFileRateLimit *rate_limit;
     QEMUFileSetRateLimit *set_rate_limit;
@@ -263,11 +265,11 @@  QEMUFile *qemu_popen(FILE *stdio_file, const char *mode)
     s->stdio_file = stdio_file;
 
     if(mode[0] == 'r') {
-        s->file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_pclose, 
-				 NULL, NULL, NULL);
+        s->file = qemu_fopen_ops(s, NULL, NULL, stdio_get_buffer,
+                 NULL, stdio_pclose, NULL, NULL, NULL);
     } else {
-        s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_pclose, 
-				 NULL, NULL, NULL);
+        s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, NULL, NULL, 
+                 stdio_pclose, NULL, NULL, NULL);
     }
     return s->file;
 }
@@ -312,11 +314,11 @@  QEMUFile *qemu_fdopen(int fd, const char *mode)
         goto fail;
 
     if(mode[0] == 'r') {
-        s->file = qemu_fopen_ops(s, NULL, stdio_get_buffer, stdio_fclose, 
-				 NULL, NULL, NULL);
+        s->file = qemu_fopen_ops(s, NULL, NULL, stdio_get_buffer, NULL,
+                 stdio_fclose, NULL, NULL, NULL);
     } else {
-        s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, stdio_fclose, 
-				 NULL, NULL, NULL);
+        s->file = qemu_fopen_ops(s, stdio_put_buffer, NULL, NULL, NULL,
+                 stdio_fclose, NULL, NULL, NULL);
     }
     return s->file;
 
@@ -330,8 +332,8 @@  QEMUFile *qemu_fopen_socket(int fd)
     QEMUFileSocket *s = qemu_mallocz(sizeof(QEMUFileSocket));
 
     s->fd = fd;
-    s->file = qemu_fopen_ops(s, NULL, socket_get_buffer, socket_close, 
-			     NULL, NULL, NULL);
+    s->file = qemu_fopen_ops(s, NULL, NULL, socket_get_buffer, NULL,
+                             socket_close, NULL, NULL, NULL);
     return s->file;
 }
 
@@ -368,11 +370,11 @@  QEMUFile *qemu_fopen(const char *filename, const char *mode)
         goto fail;
     
     if(mode[0] == 'w') {
-        s->file = qemu_fopen_ops(s, file_put_buffer, NULL, stdio_fclose, 
-				 NULL, NULL, NULL);
+        s->file = qemu_fopen_ops(s, file_put_buffer, NULL, NULL, NULL,
+                  stdio_fclose, NULL, NULL, NULL);
     } else {
-        s->file = qemu_fopen_ops(s, NULL, file_get_buffer, stdio_fclose, 
-			       NULL, NULL, NULL);
+        s->file = qemu_fopen_ops(s, NULL, NULL, file_get_buffer, NULL,
+                  stdio_fclose, NULL, NULL, NULL);
     }
     return s->file;
 fail:
@@ -400,13 +402,16 @@  static int bdrv_fclose(void *opaque)
 static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 {
     if (is_writable)
-        return qemu_fopen_ops(bs, block_put_buffer, NULL, bdrv_fclose, 
-			      NULL, NULL, NULL);
-    return qemu_fopen_ops(bs, NULL, block_get_buffer, bdrv_fclose, NULL, NULL, NULL);
+        return qemu_fopen_ops(bs, block_put_buffer, NULL, NULL, NULL,
+                  bdrv_fclose, NULL, NULL, NULL);
+    return qemu_fopen_ops(bs, NULL, NULL, block_get_buffer, NULL, bdrv_fclose, NULL, NULL, NULL);
 }
 
-QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
+QEMUFile *qemu_fopen_ops(void *opaque,
+                         QEMUFilePutBufferFunc *put_buffer,
+                         QEMUFilePutVectorFunc *put_vector,
                          QEMUFileGetBufferFunc *get_buffer,
+                         QEMUFileGetVectorFunc *get_vector,
                          QEMUFileCloseFunc *close,
                          QEMUFileRateLimit *rate_limit,
                          QEMUFileSetRateLimit *set_rate_limit,
@@ -418,7 +423,9 @@  QEMUFile *qemu_fopen_ops(void *opaque, QEMUFilePutBufferFunc *put_buffer,
 
     f->opaque = opaque;
     f->put_buffer = put_buffer;
+    f->put_vector = put_vector;
     f->get_buffer = get_buffer;
+    f->get_vector = get_vector;
     f->close = close;
     f->rate_limit = rate_limit;
     f->set_rate_limit = set_rate_limit;