diff mbox series

[v2,3/7] migration/multifd: Zero page transmission on the multifd thread.

Message ID 20240216224002.1476890-4-hao.xiang@bytedance.com
State New
Headers show
Series Introduce multifd zero page checking. | expand

Commit Message

Hao Xiang Feb. 16, 2024, 10:39 p.m. UTC
1. Implements the zero page detection and handling on the multifd
threads for non-compression, zlib and zstd compression backends.
2. Added a new value 'multifd' in ZeroPageDetection enumeration.
3. Add proper asserts to ensure pages->normal are used for normal pages
in all scenarios.

Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
---
 migration/meson.build         |  1 +
 migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
 migration/multifd-zlib.c      | 26 ++++++++++++---
 migration/multifd-zstd.c      | 25 ++++++++++++---
 migration/multifd.c           | 50 +++++++++++++++++++++++------
 migration/multifd.h           |  7 +++++
 qapi/migration.json           |  4 ++-
 7 files changed, 151 insertions(+), 21 deletions(-)
 create mode 100644 migration/multifd-zero-page.c

Comments

Richard Henderson Feb. 16, 2024, 11:49 p.m. UTC | #1
On 2/16/24 12:39, Hao Xiang wrote:
> +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> +{
> +    for (int i = 0; i < p->zero_num; i++) {
> +        void *page = p->host + p->zero[i];
> +        if (!buffer_is_zero(page, p->page_size)) {
> +            memset(page, 0, p->page_size);
> +        }
> +    }
> +}

You should not check the buffer is zero here, you should just zero it.


r~
Markus Armbruster Feb. 21, 2024, 12:04 p.m. UTC | #2
Hao Xiang <hao.xiang@bytedance.com> writes:

> 1. Implements the zero page detection and handling on the multifd
> threads for non-compression, zlib and zstd compression backends.
> 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> 3. Add proper asserts to ensure pages->normal are used for normal pages
> in all scenarios.
>
> Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>

[...]

> diff --git a/qapi/migration.json b/qapi/migration.json
> index 99843a8e95..e2450b92d4 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -660,9 +660,11 @@
>  #
>  # @none: Do not perform zero page checking.
>  #
> +# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)

As pointed out in my review of PATCH 3, the entire type is since 9.0.

> +#
>  ##
>  { 'enum': 'ZeroPageDetection',
> -  'data': [ 'legacy', 'none' ] }
> +  'data': [ 'legacy', 'none', 'multifd' ] }
>  
>  ##
>  # @BitmapMigrationBitmapAliasTransform:

I don't like having 'none' (don't detect) between the two ways to
detect.  Put it either first or last.
Elena Ufimtseva Feb. 21, 2024, 4 p.m. UTC | #3
On Fri, Feb 16, 2024 at 2:42 PM Hao Xiang <hao.xiang@bytedance.com> wrote:

> 1. Implements the zero page detection and handling on the multifd
> threads for non-compression, zlib and zstd compression backends.
> 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> 3. Add proper asserts to ensure pages->normal are used for normal pages
> in all scenarios.
>
> Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
> ---
>  migration/meson.build         |  1 +
>  migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
>  migration/multifd-zlib.c      | 26 ++++++++++++---
>  migration/multifd-zstd.c      | 25 ++++++++++++---
>  migration/multifd.c           | 50 +++++++++++++++++++++++------
>  migration/multifd.h           |  7 +++++
>  qapi/migration.json           |  4 ++-
>  7 files changed, 151 insertions(+), 21 deletions(-)
>  create mode 100644 migration/multifd-zero-page.c
>
> diff --git a/migration/meson.build b/migration/meson.build
> index 92b1cc4297..1eeb915ff6 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -22,6 +22,7 @@ system_ss.add(files(
>    'migration.c',
>    'multifd.c',
>    'multifd-zlib.c',
> +  'multifd-zero-page.c',
>    'ram-compress.c',
>    'options.c',
>    'postcopy-ram.c',
> diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> new file mode 100644
> index 0000000000..f0cd8e2c53
> --- /dev/null
> +++ b/migration/multifd-zero-page.c
> @@ -0,0 +1,59 @@
> +/*
> + * Multifd zero page detection implementation.
> + *
> + * Copyright (c) 2024 Bytedance Inc
> + *
> + * Authors:
> + *  Hao Xiang <hao.xiang@bytedance.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/cutils.h"
> +#include "exec/ramblock.h"
> +#include "migration.h"
> +#include "multifd.h"
> +#include "options.h"
> +#include "ram.h"
> +
> +void multifd_zero_page_check_send(MultiFDSendParams *p)
> +{
> +    /*
> +     * QEMU older than 9.0 don't understand zero page
> +     * on multifd channel. This switch is required to
> +     * maintain backward compatibility.
> +     */
> +    bool use_multifd_zero_page =
> +        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> +    MultiFDPages_t *pages = p->pages;
> +    RAMBlock *rb = pages->block;
> +
> +    assert(pages->num != 0);
>

Not needed, the check is done right before calling send_prepare.


> +    assert(pages->normal_num == 0);
> +    assert(pages->zero_num == 0);
>

Why these asserts are needed?

> +
>
+    for (int i = 0; i < pages->num; i++) {
> +        uint64_t offset = pages->offset[i];
> +        if (use_multifd_zero_page &&
> +            buffer_is_zero(rb->host + offset, p->page_size)) {
> +            pages->zero[pages->zero_num] = offset;
> +            pages->zero_num++;
> +            ram_release_page(rb->idstr, offset);
> +        } else {
> +            pages->normal[pages->normal_num] = offset;
> +            pages->normal_num++;
> +        }
> +    }
> +}
> +
> +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> +{
> +    for (int i = 0; i < p->zero_num; i++) {
> +        void *page = p->host + p->zero[i];
> +        if (!buffer_is_zero(page, p->page_size)) {
> +            memset(page, 0, p->page_size);
> +        }
> +    }
> +}
> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> index 012e3bdea1..cdfe0fa70e 100644
> --- a/migration/multifd-zlib.c
> +++ b/migration/multifd-zlib.c
> @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p,
> Error **errp)
>      int ret;
>      uint32_t i;
>
> +    multifd_zero_page_check_send(p);
> +
> +    if (!pages->normal_num) {
> +        p->next_packet_size = 0;
> +        goto out;
> +    }
> +
>      multifd_send_prepare_header(p);
>
> -    for (i = 0; i < pages->num; i++) {
> +    for (i = 0; i < pages->normal_num; i++) {
>          uint32_t available = z->zbuff_len - out_size;
>          int flush = Z_NO_FLUSH;
>
> -        if (i == pages->num - 1) {
> +        if (i == pages->normal_num - 1) {
>              flush = Z_SYNC_FLUSH;
>          }
>
> @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p,
> Error **errp)
>           * with compression. zlib does not guarantee that this is safe,
>           * therefore copy the page before calling deflate().
>           */
> -        memcpy(z->buf, p->pages->block->host + pages->offset[i],
> p->page_size);
> +        memcpy(z->buf, p->pages->block->host + pages->normal[i],
> p->page_size);
>          zs->avail_in = p->page_size;
>          zs->next_in = z->buf;
>
> @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p,
> Error **errp)
>      p->iov[p->iovs_num].iov_len = out_size;
>      p->iovs_num++;
>      p->next_packet_size = out_size;
> -    p->flags |= MULTIFD_FLAG_ZLIB;
>
> +out:
> +    p->flags |= MULTIFD_FLAG_ZLIB;
>      multifd_send_fill_packet(p);
> -
>
Spurious?

     return 0;
>  }
>
> @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p,
> Error **errp)
>                     p->id, flags, MULTIFD_FLAG_ZLIB);
>          return -1;
>      }
> +
> +    multifd_zero_page_check_recv(p);
> +
> +    if (!p->normal_num) {
> +        assert(in_size == 0);
> +        return 0;
>

return here will have no effect. Also, why is assert needed here?
This change also does not seem to fit the description of the patch, probaby
separate patch will be better.


> +    }
> +
>      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>
>      if (ret != 0) {
> @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error
> **errp)
>                     p->id, out_size, expected_size);
>          return -1;
>      }
> +
>      return 0;
>  }
>
> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> index dc8fe43e94..27a1eba075 100644
> --- a/migration/multifd-zstd.c
> +++ b/migration/multifd-zstd.c
> @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p,
> Error **errp)
>      int ret;
>      uint32_t i;
>
> +    multifd_zero_page_check_send(p);
> +
> +    if (!pages->normal_num) {
> +        p->next_packet_size = 0;
> +        goto out;
> +    }
> +
>      multifd_send_prepare_header(p);
>
>      z->out.dst = z->zbuff;
>      z->out.size = z->zbuff_len;
>      z->out.pos = 0;
>
> -    for (i = 0; i < pages->num; i++) {
> +    for (i = 0; i < pages->normal_num; i++) {
>          ZSTD_EndDirective flush = ZSTD_e_continue;
>
> -        if (i == pages->num - 1) {
> +        if (i == pages->normal_num - 1) {
>              flush = ZSTD_e_flush;
>          }
> -        z->in.src = p->pages->block->host + pages->offset[i];
> +        z->in.src = p->pages->block->host + pages->normal[i];
>          z->in.size = p->page_size;
>          z->in.pos = 0;
>
> @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p,
> Error **errp)
>      p->iov[p->iovs_num].iov_len = z->out.pos;
>      p->iovs_num++;
>      p->next_packet_size = z->out.pos;
> -    p->flags |= MULTIFD_FLAG_ZSTD;
>
> +out:
> +    p->flags |= MULTIFD_FLAG_ZSTD;
>      multifd_send_fill_packet(p);
> -
>
Spurious removal.


>      return 0;
>  }
>
> @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p,
> Error **errp)
>                     p->id, flags, MULTIFD_FLAG_ZSTD);
>          return -1;
>      }
> +
> +    multifd_zero_page_check_recv(p);
> +
> +    if (!p->normal_num) {
> +        assert(in_size == 0);
> +        return 0;
> +    }
> +
>
Same question here about assert.


>      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>
>      if (ret != 0) {
> diff --git a/migration/multifd.c b/migration/multifd.c
> index a33dba40d9..fbb40ea10b 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -11,6 +11,7 @@
>   */
>
>  #include "qemu/osdep.h"
> +#include "qemu/cutils.h"
>  #include "qemu/rcu.h"
>  #include "exec/target_page.h"
>  #include "sysemu/sysemu.h"
> @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p,
> Error **errp)
>      MultiFDPages_t *pages = p->pages;
>      int ret;
>
> +    multifd_zero_page_check_send(p);
> +
>      if (!use_zero_copy_send) {
>          /*
>           * Only !zerocopy needs the header in IOV; zerocopy will
> @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p,
> Error **errp)
>          multifd_send_prepare_header(p);
>      }
>
> -    for (int i = 0; i < pages->num; i++) {
> -        p->iov[p->iovs_num].iov_base = pages->block->host +
> pages->offset[i];
> +    for (int i = 0; i < pages->normal_num; i++) {
> +        p->iov[p->iovs_num].iov_base = pages->block->host +
> pages->normal[i];
>          p->iov[p->iovs_num].iov_len = p->page_size;
>          p->iovs_num++;
>      }
>
> -    p->next_packet_size = pages->num * p->page_size;
> +    p->next_packet_size = pages->normal_num * p->page_size;
>      p->flags |= MULTIFD_FLAG_NOCOMP;
>
>      multifd_send_fill_packet(p);
> @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p,
> Error **errp)
>                     p->id, flags, MULTIFD_FLAG_NOCOMP);
>          return -1;
>      }
> +
> +    multifd_zero_page_check_recv(p);
> +
> +    if (!p->normal_num) {
> +        return 0;
> +    }
> +
>      for (int i = 0; i < p->normal_num; i++) {
>          p->iov[i].iov_base = p->host + p->normal[i];
>          p->iov[i].iov_len = p->page_size;
> @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>
>      packet->flags = cpu_to_be32(p->flags);
>      packet->pages_alloc = cpu_to_be32(p->pages->allocated);
> -    packet->normal_pages = cpu_to_be32(pages->num);
> +    packet->normal_pages = cpu_to_be32(pages->normal_num);
>      packet->zero_pages = cpu_to_be32(pages->zero_num);
>      packet->next_packet_size = cpu_to_be32(p->next_packet_size);
>
> @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>          strncpy(packet->ramblock, pages->block->idstr, 256);
>      }
>
> -    for (i = 0; i < pages->num; i++) {
> +    for (i = 0; i < pages->normal_num; i++) {
>          /* there are architectures where ram_addr_t is 32 bit */
> -        uint64_t temp = pages->offset[i];
> +        uint64_t temp = pages->normal[i];
>
>          packet->offset[i] = cpu_to_be64(temp);
>      }
>
> +    for (i = 0; i < pages->zero_num; i++) {
> +        /* there are architectures where ram_addr_t is 32 bit */
> +        uint64_t temp = pages->zero[i];
> +
> +        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
> +    }
> +
>      p->packets_sent++;
> -    p->total_normal_pages += pages->num;
> +    p->total_normal_pages += pages->normal_num;
>      p->total_zero_pages += pages->zero_num;
>
> -    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
> +    trace_multifd_send(p->id, packet_num, pages->normal_num,
> pages->zero_num,
>                         p->flags, p->next_packet_size);
>  }
>
> @@ -451,6 +468,18 @@ static int
> multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>          p->normal[i] = offset;
>      }
>
> +    for (i = 0; i < p->zero_num; i++) {
> +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
> +
> +        if (offset > (p->block->used_length - p->page_size)) {
> +            error_setg(errp, "multifd: offset too long %" PRIu64
> +                       " (max " RAM_ADDR_FMT ")",
> +                       offset, p->block->used_length);
> +            return -1;
> +        }
> +        p->zero[i] = offset;
> +    }
> +
>      return 0;
>  }
>
> @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
>
>              stat64_add(&mig_stats.multifd_bytes,
>                         p->next_packet_size + p->packet_len);
> -            stat64_add(&mig_stats.normal_pages, pages->num);
> +            stat64_add(&mig_stats.normal_pages, pages->normal_num);
>              stat64_add(&mig_stats.zero_pages, pages->zero_num);
>
>              multifd_pages_reset(p->pages);
> @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
>          p->flags &= ~MULTIFD_FLAG_SYNC;
>          qemu_mutex_unlock(&p->mutex);
>
> -        if (p->normal_num) {
> +        if (p->normal_num + p->zero_num) {
> +            assert(!(flags & MULTIFD_FLAG_SYNC));
>
This assertion seems to be not relevant to this patch. Could you post it
separately and explain why it's needed here?


>              ret = multifd_recv_state->ops->recv_pages(p, &local_err);
>              if (ret != 0) {
>                  break;
> diff --git a/migration/multifd.h b/migration/multifd.h
> index 9822ff298a..125f0bbe60 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -53,6 +53,11 @@ typedef struct {
>      uint32_t unused32[1];    /* Reserved for future use */
>      uint64_t unused64[3];    /* Reserved for future use */
>      char ramblock[256];
> +    /*
> +     * This array contains the pointers to:
> +     *  - normal pages (initial normal_pages entries)
> +     *  - zero pages (following zero_pages entries)
> +     */
>      uint64_t offset[];
>  } __attribute__((packed)) MultiFDPacket_t;
>
> @@ -224,6 +229,8 @@ typedef struct {
>
>  void multifd_register_ops(int method, MultiFDMethods *ops);
>  void multifd_send_fill_packet(MultiFDSendParams *p);
> +void multifd_zero_page_check_send(MultiFDSendParams *p);
> +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
>
>  static inline void multifd_send_prepare_header(MultiFDSendParams *p)
>  {
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 99843a8e95..e2450b92d4 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -660,9 +660,11 @@
>  #
>  # @none: Do not perform zero page checking.
>  #
> +# @multifd: Perform zero page checking on the multifd sender thread.
> (since 9.0)
> +#
>  ##
>  { 'enum': 'ZeroPageDetection',
> -  'data': [ 'legacy', 'none' ] }
> +  'data': [ 'legacy', 'none', 'multifd' ] }
>
>  ##
>  # @BitmapMigrationBitmapAliasTransform:
> --
> 2.30.2
>
>
>
Fabiano Rosas Feb. 21, 2024, 9:04 p.m. UTC | #4
Hao Xiang <hao.xiang@bytedance.com> writes:

> 1. Implements the zero page detection and handling on the multifd
> threads for non-compression, zlib and zstd compression backends.
> 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> 3. Add proper asserts to ensure pages->normal are used for normal pages
> in all scenarios.
>
> Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
> ---
>  migration/meson.build         |  1 +
>  migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
>  migration/multifd-zlib.c      | 26 ++++++++++++---
>  migration/multifd-zstd.c      | 25 ++++++++++++---
>  migration/multifd.c           | 50 +++++++++++++++++++++++------
>  migration/multifd.h           |  7 +++++
>  qapi/migration.json           |  4 ++-
>  7 files changed, 151 insertions(+), 21 deletions(-)
>  create mode 100644 migration/multifd-zero-page.c
>
> diff --git a/migration/meson.build b/migration/meson.build
> index 92b1cc4297..1eeb915ff6 100644
> --- a/migration/meson.build
> +++ b/migration/meson.build
> @@ -22,6 +22,7 @@ system_ss.add(files(
>    'migration.c',
>    'multifd.c',
>    'multifd-zlib.c',
> +  'multifd-zero-page.c',
>    'ram-compress.c',
>    'options.c',
>    'postcopy-ram.c',
> diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> new file mode 100644
> index 0000000000..f0cd8e2c53
> --- /dev/null
> +++ b/migration/multifd-zero-page.c
> @@ -0,0 +1,59 @@
> +/*
> + * Multifd zero page detection implementation.
> + *
> + * Copyright (c) 2024 Bytedance Inc
> + *
> + * Authors:
> + *  Hao Xiang <hao.xiang@bytedance.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qemu/cutils.h"
> +#include "exec/ramblock.h"
> +#include "migration.h"
> +#include "multifd.h"
> +#include "options.h"
> +#include "ram.h"
> +
> +void multifd_zero_page_check_send(MultiFDSendParams *p)
> +{
> +    /*
> +     * QEMU older than 9.0 don't understand zero page
> +     * on multifd channel. This switch is required to
> +     * maintain backward compatibility.
> +     */
> +    bool use_multifd_zero_page =
> +        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> +    MultiFDPages_t *pages = p->pages;
> +    RAMBlock *rb = pages->block;
> +
> +    assert(pages->num != 0);
> +    assert(pages->normal_num == 0);
> +    assert(pages->zero_num == 0);

We can drop these before the final version.

> +
> +    for (int i = 0; i < pages->num; i++) {
> +        uint64_t offset = pages->offset[i];
> +        if (use_multifd_zero_page &&
> +            buffer_is_zero(rb->host + offset, p->page_size)) {
> +            pages->zero[pages->zero_num] = offset;
> +            pages->zero_num++;
> +            ram_release_page(rb->idstr, offset);
> +        } else {
> +            pages->normal[pages->normal_num] = offset;
> +            pages->normal_num++;
> +        }
> +    }

I don't think it's super clean to have three arrays offset, zero and
normal, all sized for the full packet size. It might be possible to just
carry a bitmap of non-zero pages along with pages->offset and operate on
that instead.

What do you think?

Peter, any ideas? Should we just leave this for another time?

> +}
> +
> +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> +{
> +    for (int i = 0; i < p->zero_num; i++) {
> +        void *page = p->host + p->zero[i];
> +        if (!buffer_is_zero(page, p->page_size)) {
> +            memset(page, 0, p->page_size);
> +        }
> +    }
> +}
> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> index 012e3bdea1..cdfe0fa70e 100644
> --- a/migration/multifd-zlib.c
> +++ b/migration/multifd-zlib.c
> @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>      int ret;
>      uint32_t i;
>  
> +    multifd_zero_page_check_send(p);
> +
> +    if (!pages->normal_num) {
> +        p->next_packet_size = 0;
> +        goto out;
> +    }
> +
>      multifd_send_prepare_header(p);
>  
> -    for (i = 0; i < pages->num; i++) {
> +    for (i = 0; i < pages->normal_num; i++) {
>          uint32_t available = z->zbuff_len - out_size;
>          int flush = Z_NO_FLUSH;
>  
> -        if (i == pages->num - 1) {
> +        if (i == pages->normal_num - 1) {
>              flush = Z_SYNC_FLUSH;
>          }
>  
> @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>           * with compression. zlib does not guarantee that this is safe,
>           * therefore copy the page before calling deflate().
>           */
> -        memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
> +        memcpy(z->buf, p->pages->block->host + pages->normal[i], p->page_size);
>          zs->avail_in = p->page_size;
>          zs->next_in = z->buf;
>  
> @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>      p->iov[p->iovs_num].iov_len = out_size;
>      p->iovs_num++;
>      p->next_packet_size = out_size;
> -    p->flags |= MULTIFD_FLAG_ZLIB;
>  
> +out:
> +    p->flags |= MULTIFD_FLAG_ZLIB;
>      multifd_send_fill_packet(p);
> -
>      return 0;
>  }
>  
> @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
>                     p->id, flags, MULTIFD_FLAG_ZLIB);
>          return -1;
>      }
> +
> +    multifd_zero_page_check_recv(p);
> +
> +    if (!p->normal_num) {
> +        assert(in_size == 0);
> +        return 0;
> +    }
> +
>      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>  
>      if (ret != 0) {
> @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
>                     p->id, out_size, expected_size);
>          return -1;
>      }
> +
>      return 0;
>  }
>  
> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> index dc8fe43e94..27a1eba075 100644
> --- a/migration/multifd-zstd.c
> +++ b/migration/multifd-zstd.c
> @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
>      int ret;
>      uint32_t i;
>  
> +    multifd_zero_page_check_send(p);
> +
> +    if (!pages->normal_num) {
> +        p->next_packet_size = 0;
> +        goto out;
> +    }
> +
>      multifd_send_prepare_header(p);
>  
>      z->out.dst = z->zbuff;
>      z->out.size = z->zbuff_len;
>      z->out.pos = 0;
>  
> -    for (i = 0; i < pages->num; i++) {
> +    for (i = 0; i < pages->normal_num; i++) {
>          ZSTD_EndDirective flush = ZSTD_e_continue;
>  
> -        if (i == pages->num - 1) {
> +        if (i == pages->normal_num - 1) {
>              flush = ZSTD_e_flush;
>          }
> -        z->in.src = p->pages->block->host + pages->offset[i];
> +        z->in.src = p->pages->block->host + pages->normal[i];
>          z->in.size = p->page_size;
>          z->in.pos = 0;
>  
> @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
>      p->iov[p->iovs_num].iov_len = z->out.pos;
>      p->iovs_num++;
>      p->next_packet_size = z->out.pos;
> -    p->flags |= MULTIFD_FLAG_ZSTD;
>  
> +out:
> +    p->flags |= MULTIFD_FLAG_ZSTD;
>      multifd_send_fill_packet(p);
> -
>      return 0;
>  }
>  
> @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
>                     p->id, flags, MULTIFD_FLAG_ZSTD);
>          return -1;
>      }
> +
> +    multifd_zero_page_check_recv(p);
> +
> +    if (!p->normal_num) {
> +        assert(in_size == 0);
> +        return 0;
> +    }
> +
>      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>  
>      if (ret != 0) {
> diff --git a/migration/multifd.c b/migration/multifd.c
> index a33dba40d9..fbb40ea10b 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -11,6 +11,7 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/cutils.h"
>  #include "qemu/rcu.h"
>  #include "exec/target_page.h"
>  #include "sysemu/sysemu.h"
> @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
>      MultiFDPages_t *pages = p->pages;
>      int ret;
>  
> +    multifd_zero_page_check_send(p);
> +
>      if (!use_zero_copy_send) {
>          /*
>           * Only !zerocopy needs the header in IOV; zerocopy will
> @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
>          multifd_send_prepare_header(p);
>      }
>  
> -    for (int i = 0; i < pages->num; i++) {
> -        p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
> +    for (int i = 0; i < pages->normal_num; i++) {
> +        p->iov[p->iovs_num].iov_base = pages->block->host + pages->normal[i];
>          p->iov[p->iovs_num].iov_len = p->page_size;
>          p->iovs_num++;
>      }
>  
> -    p->next_packet_size = pages->num * p->page_size;
> +    p->next_packet_size = pages->normal_num * p->page_size;
>      p->flags |= MULTIFD_FLAG_NOCOMP;
>  
>      multifd_send_fill_packet(p);
> @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
>                     p->id, flags, MULTIFD_FLAG_NOCOMP);
>          return -1;
>      }
> +
> +    multifd_zero_page_check_recv(p);
> +
> +    if (!p->normal_num) {
> +        return 0;
> +    }
> +
>      for (int i = 0; i < p->normal_num; i++) {
>          p->iov[i].iov_base = p->host + p->normal[i];
>          p->iov[i].iov_len = p->page_size;
> @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>  
>      packet->flags = cpu_to_be32(p->flags);
>      packet->pages_alloc = cpu_to_be32(p->pages->allocated);
> -    packet->normal_pages = cpu_to_be32(pages->num);
> +    packet->normal_pages = cpu_to_be32(pages->normal_num);
>      packet->zero_pages = cpu_to_be32(pages->zero_num);
>      packet->next_packet_size = cpu_to_be32(p->next_packet_size);
>  
> @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>          strncpy(packet->ramblock, pages->block->idstr, 256);
>      }
>  
> -    for (i = 0; i < pages->num; i++) {
> +    for (i = 0; i < pages->normal_num; i++) {
>          /* there are architectures where ram_addr_t is 32 bit */
> -        uint64_t temp = pages->offset[i];
> +        uint64_t temp = pages->normal[i];
>  
>          packet->offset[i] = cpu_to_be64(temp);
>      }
>  
> +    for (i = 0; i < pages->zero_num; i++) {
> +        /* there are architectures where ram_addr_t is 32 bit */
> +        uint64_t temp = pages->zero[i];
> +
> +        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
> +    }
> +
>      p->packets_sent++;
> -    p->total_normal_pages += pages->num;
> +    p->total_normal_pages += pages->normal_num;
>      p->total_zero_pages += pages->zero_num;
>  
> -    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
> +    trace_multifd_send(p->id, packet_num, pages->normal_num, pages->zero_num,
>                         p->flags, p->next_packet_size);
>  }
>  
> @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>          p->normal[i] = offset;
>      }
>  
> +    for (i = 0; i < p->zero_num; i++) {
> +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
> +
> +        if (offset > (p->block->used_length - p->page_size)) {
> +            error_setg(errp, "multifd: offset too long %" PRIu64
> +                       " (max " RAM_ADDR_FMT ")",
> +                       offset, p->block->used_length);
> +            return -1;
> +        }
> +        p->zero[i] = offset;
> +    }
> +
>      return 0;
>  }
>  
> @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
>  
>              stat64_add(&mig_stats.multifd_bytes,
>                         p->next_packet_size + p->packet_len);
> -            stat64_add(&mig_stats.normal_pages, pages->num);
> +            stat64_add(&mig_stats.normal_pages, pages->normal_num);
>              stat64_add(&mig_stats.zero_pages, pages->zero_num);
>  
>              multifd_pages_reset(p->pages);
> @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
>          p->flags &= ~MULTIFD_FLAG_SYNC;
>          qemu_mutex_unlock(&p->mutex);
>  
> -        if (p->normal_num) {
> +        if (p->normal_num + p->zero_num) {
> +            assert(!(flags & MULTIFD_FLAG_SYNC));

This breaks 8.2 -> 9.0 migration. QEMU 8.2 is still sending the SYNC
along with the data packet.

>              ret = multifd_recv_state->ops->recv_pages(p, &local_err);
>              if (ret != 0) {
>                  break;
> diff --git a/migration/multifd.h b/migration/multifd.h
> index 9822ff298a..125f0bbe60 100644
> --- a/migration/multifd.h
> +++ b/migration/multifd.h
> @@ -53,6 +53,11 @@ typedef struct {
>      uint32_t unused32[1];    /* Reserved for future use */
>      uint64_t unused64[3];    /* Reserved for future use */
>      char ramblock[256];
> +    /*
> +     * This array contains the pointers to:
> +     *  - normal pages (initial normal_pages entries)
> +     *  - zero pages (following zero_pages entries)
> +     */
>      uint64_t offset[];
>  } __attribute__((packed)) MultiFDPacket_t;
>  
> @@ -224,6 +229,8 @@ typedef struct {
>  
>  void multifd_register_ops(int method, MultiFDMethods *ops);
>  void multifd_send_fill_packet(MultiFDSendParams *p);
> +void multifd_zero_page_check_send(MultiFDSendParams *p);
> +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
>  
>  static inline void multifd_send_prepare_header(MultiFDSendParams *p)
>  {
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 99843a8e95..e2450b92d4 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -660,9 +660,11 @@
>  #
>  # @none: Do not perform zero page checking.
>  #
> +# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)
> +#
>  ##
>  { 'enum': 'ZeroPageDetection',
> -  'data': [ 'legacy', 'none' ] }
> +  'data': [ 'legacy', 'none', 'multifd' ] }
>  
>  ##
>  # @BitmapMigrationBitmapAliasTransform:
Peter Xu Feb. 23, 2024, 2:20 a.m. UTC | #5
On Wed, Feb 21, 2024 at 06:04:10PM -0300, Fabiano Rosas wrote:
> Hao Xiang <hao.xiang@bytedance.com> writes:
> 
> > 1. Implements the zero page detection and handling on the multifd
> > threads for non-compression, zlib and zstd compression backends.
> > 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> > 3. Add proper asserts to ensure pages->normal are used for normal pages
> > in all scenarios.
> >
> > Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
> > ---
> >  migration/meson.build         |  1 +
> >  migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
> >  migration/multifd-zlib.c      | 26 ++++++++++++---
> >  migration/multifd-zstd.c      | 25 ++++++++++++---
> >  migration/multifd.c           | 50 +++++++++++++++++++++++------
> >  migration/multifd.h           |  7 +++++
> >  qapi/migration.json           |  4 ++-
> >  7 files changed, 151 insertions(+), 21 deletions(-)
> >  create mode 100644 migration/multifd-zero-page.c
> >
> > diff --git a/migration/meson.build b/migration/meson.build
> > index 92b1cc4297..1eeb915ff6 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -22,6 +22,7 @@ system_ss.add(files(
> >    'migration.c',
> >    'multifd.c',
> >    'multifd-zlib.c',
> > +  'multifd-zero-page.c',
> >    'ram-compress.c',
> >    'options.c',
> >    'postcopy-ram.c',
> > diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> > new file mode 100644
> > index 0000000000..f0cd8e2c53
> > --- /dev/null
> > +++ b/migration/multifd-zero-page.c
> > @@ -0,0 +1,59 @@
> > +/*
> > + * Multifd zero page detection implementation.
> > + *
> > + * Copyright (c) 2024 Bytedance Inc
> > + *
> > + * Authors:
> > + *  Hao Xiang <hao.xiang@bytedance.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qemu/cutils.h"
> > +#include "exec/ramblock.h"
> > +#include "migration.h"
> > +#include "multifd.h"
> > +#include "options.h"
> > +#include "ram.h"
> > +
> > +void multifd_zero_page_check_send(MultiFDSendParams *p)
> > +{
> > +    /*
> > +     * QEMU older than 9.0 don't understand zero page
> > +     * on multifd channel. This switch is required to
> > +     * maintain backward compatibility.
> > +     */
> > +    bool use_multifd_zero_page =
> > +        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> > +    MultiFDPages_t *pages = p->pages;
> > +    RAMBlock *rb = pages->block;
> > +
> > +    assert(pages->num != 0);
> > +    assert(pages->normal_num == 0);
> > +    assert(pages->zero_num == 0);
> 
> We can drop these before the final version.
> 
> > +
> > +    for (int i = 0; i < pages->num; i++) {
> > +        uint64_t offset = pages->offset[i];
> > +        if (use_multifd_zero_page &&
> > +            buffer_is_zero(rb->host + offset, p->page_size)) {
> > +            pages->zero[pages->zero_num] = offset;
> > +            pages->zero_num++;
> > +            ram_release_page(rb->idstr, offset);
> > +        } else {
> > +            pages->normal[pages->normal_num] = offset;
> > +            pages->normal_num++;
> > +        }
> > +    }
> 
> I don't think it's super clean to have three arrays offset, zero and
> normal, all sized for the full packet size. It might be possible to just
> carry a bitmap of non-zero pages along with pages->offset and operate on
> that instead.
> 
> What do you think?
> 
> Peter, any ideas? Should we just leave this for another time?

Yeah I think a bitmap should save quite a few fields indeed, it'll however
make the latter iteration slightly harder by walking both (offset[],
bitmap), process the page only if bitmap is set for the offset.

IIUC we perhaps don't even need a bitmap?  AFAIU what we only need in
Multifdpages_t is one extra field to mark "how many normal pages", aka,
normal_num here (zero_num can be calculated from num-normal_num).  Then
the zero page detection logic should do two things:

  - Sort offset[] array so that it starts with normal pages, followed up by
    zero pages

  - Setup normal_num to be the number of normal pages

Then we reduce 2 new arrays (normal[], zero[]) + 2 new fields (normal_num,
zero_num) -> 1 new field (normal_num).  It'll also be trivial to fill the
packet header later because offset[] is exactly that.

Side note - I still think it's confusing to read this patch and previous
patch separately.  Obviously previous patch introduced these new fields
without justifying their values yet.  IMHO it'll be easier to review if you
merge the two patches.

> 
> > +}
> > +
> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > +{
> > +    for (int i = 0; i < p->zero_num; i++) {
> > +        void *page = p->host + p->zero[i];
> > +        if (!buffer_is_zero(page, p->page_size)) {
> > +            memset(page, 0, p->page_size);
> > +        }
> > +    }
> > +}
> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > index 012e3bdea1..cdfe0fa70e 100644
> > --- a/migration/multifd-zlib.c
> > +++ b/migration/multifd-zlib.c
> > @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> >      int ret;
> >      uint32_t i;
> >  
> > +    multifd_zero_page_check_send(p);
> > +
> > +    if (!pages->normal_num) {
> > +        p->next_packet_size = 0;
> > +        goto out;
> > +    }
> > +
> >      multifd_send_prepare_header(p);
> >  
> > -    for (i = 0; i < pages->num; i++) {
> > +    for (i = 0; i < pages->normal_num; i++) {
> >          uint32_t available = z->zbuff_len - out_size;
> >          int flush = Z_NO_FLUSH;
> >  
> > -        if (i == pages->num - 1) {
> > +        if (i == pages->normal_num - 1) {
> >              flush = Z_SYNC_FLUSH;
> >          }
> >  
> > @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> >           * with compression. zlib does not guarantee that this is safe,
> >           * therefore copy the page before calling deflate().
> >           */
> > -        memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
> > +        memcpy(z->buf, p->pages->block->host + pages->normal[i], p->page_size);
> >          zs->avail_in = p->page_size;
> >          zs->next_in = z->buf;
> >  
> > @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> >      p->iov[p->iovs_num].iov_len = out_size;
> >      p->iovs_num++;
> >      p->next_packet_size = out_size;
> > -    p->flags |= MULTIFD_FLAG_ZLIB;
> >  
> > +out:
> > +    p->flags |= MULTIFD_FLAG_ZLIB;
> >      multifd_send_fill_packet(p);
> > -
> >      return 0;
> >  }
> >  
> > @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
> >                     p->id, flags, MULTIFD_FLAG_ZLIB);
> >          return -1;
> >      }
> > +
> > +    multifd_zero_page_check_recv(p);
> > +
> > +    if (!p->normal_num) {
> > +        assert(in_size == 0);
> > +        return 0;
> > +    }
> > +
> >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
> >  
> >      if (ret != 0) {
> > @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
> >                     p->id, out_size, expected_size);
> >          return -1;
> >      }
> > +
> >      return 0;
> >  }
> >  
> > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> > index dc8fe43e94..27a1eba075 100644
> > --- a/migration/multifd-zstd.c
> > +++ b/migration/multifd-zstd.c
> > @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
> >      int ret;
> >      uint32_t i;
> >  
> > +    multifd_zero_page_check_send(p);
> > +
> > +    if (!pages->normal_num) {
> > +        p->next_packet_size = 0;
> > +        goto out;
> > +    }
> > +
> >      multifd_send_prepare_header(p);

If this forms a pattern we can introduce multifd_send_prepare_common():

bool multifd_send_prepare_common()
{
    multifd_zero_page_check_send();
    if (...) {
    
    }
    multifd_send_prepare_header();
}

> >  
> >      z->out.dst = z->zbuff;
> >      z->out.size = z->zbuff_len;
> >      z->out.pos = 0;
> >  
> > -    for (i = 0; i < pages->num; i++) {
> > +    for (i = 0; i < pages->normal_num; i++) {
> >          ZSTD_EndDirective flush = ZSTD_e_continue;
> >  
> > -        if (i == pages->num - 1) {
> > +        if (i == pages->normal_num - 1) {
> >              flush = ZSTD_e_flush;
> >          }
> > -        z->in.src = p->pages->block->host + pages->offset[i];
> > +        z->in.src = p->pages->block->host + pages->normal[i];
> >          z->in.size = p->page_size;
> >          z->in.pos = 0;
> >  
> > @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
> >      p->iov[p->iovs_num].iov_len = z->out.pos;
> >      p->iovs_num++;
> >      p->next_packet_size = z->out.pos;
> > -    p->flags |= MULTIFD_FLAG_ZSTD;
> >  
> > +out:
> > +    p->flags |= MULTIFD_FLAG_ZSTD;
> >      multifd_send_fill_packet(p);
> > -
> >      return 0;
> >  }
> >  
> > @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
> >                     p->id, flags, MULTIFD_FLAG_ZSTD);
> >          return -1;
> >      }
> > +
> > +    multifd_zero_page_check_recv(p);
> > +
> > +    if (!p->normal_num) {
> > +        assert(in_size == 0);
> > +        return 0;
> > +    }
> > +
> >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
> >  
> >      if (ret != 0) {
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index a33dba40d9..fbb40ea10b 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -11,6 +11,7 @@
> >   */
> >  
> >  #include "qemu/osdep.h"
> > +#include "qemu/cutils.h"
> >  #include "qemu/rcu.h"
> >  #include "exec/target_page.h"
> >  #include "sysemu/sysemu.h"
> > @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> >      MultiFDPages_t *pages = p->pages;
> >      int ret;
> >  
> > +    multifd_zero_page_check_send(p);
> > +
> >      if (!use_zero_copy_send) {
> >          /*
> >           * Only !zerocopy needs the header in IOV; zerocopy will
> > @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> >          multifd_send_prepare_header(p);
> >      }
> >  
> > -    for (int i = 0; i < pages->num; i++) {
> > -        p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
> > +    for (int i = 0; i < pages->normal_num; i++) {
> > +        p->iov[p->iovs_num].iov_base = pages->block->host + pages->normal[i];
> >          p->iov[p->iovs_num].iov_len = p->page_size;
> >          p->iovs_num++;
> >      }
> >  
> > -    p->next_packet_size = pages->num * p->page_size;
> > +    p->next_packet_size = pages->normal_num * p->page_size;
> >      p->flags |= MULTIFD_FLAG_NOCOMP;
> >  
> >      multifd_send_fill_packet(p);
> > @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
> >                     p->id, flags, MULTIFD_FLAG_NOCOMP);
> >          return -1;
> >      }
> > +
> > +    multifd_zero_page_check_recv(p);
> > +
> > +    if (!p->normal_num) {
> > +        return 0;
> > +    }
> > +
> >      for (int i = 0; i < p->normal_num; i++) {
> >          p->iov[i].iov_base = p->host + p->normal[i];
> >          p->iov[i].iov_len = p->page_size;
> > @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
> >  
> >      packet->flags = cpu_to_be32(p->flags);
> >      packet->pages_alloc = cpu_to_be32(p->pages->allocated);
> > -    packet->normal_pages = cpu_to_be32(pages->num);
> > +    packet->normal_pages = cpu_to_be32(pages->normal_num);
> >      packet->zero_pages = cpu_to_be32(pages->zero_num);
> >      packet->next_packet_size = cpu_to_be32(p->next_packet_size);
> >  
> > @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
> >          strncpy(packet->ramblock, pages->block->idstr, 256);
> >      }
> >  
> > -    for (i = 0; i < pages->num; i++) {
> > +    for (i = 0; i < pages->normal_num; i++) {
> >          /* there are architectures where ram_addr_t is 32 bit */
> > -        uint64_t temp = pages->offset[i];
> > +        uint64_t temp = pages->normal[i];
> >  
> >          packet->offset[i] = cpu_to_be64(temp);
> >      }
> >  
> > +    for (i = 0; i < pages->zero_num; i++) {
> > +        /* there are architectures where ram_addr_t is 32 bit */
> > +        uint64_t temp = pages->zero[i];
> > +
> > +        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
> > +    }
> > +
> >      p->packets_sent++;
> > -    p->total_normal_pages += pages->num;
> > +    p->total_normal_pages += pages->normal_num;
> >      p->total_zero_pages += pages->zero_num;
> >  
> > -    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
> > +    trace_multifd_send(p->id, packet_num, pages->normal_num, pages->zero_num,
> >                         p->flags, p->next_packet_size);
> >  }
> >  
> > @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >          p->normal[i] = offset;
> >      }
> >  
> > +    for (i = 0; i < p->zero_num; i++) {
> > +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
> > +
> > +        if (offset > (p->block->used_length - p->page_size)) {
> > +            error_setg(errp, "multifd: offset too long %" PRIu64
> > +                       " (max " RAM_ADDR_FMT ")",
> > +                       offset, p->block->used_length);
> > +            return -1;
> > +        }
> > +        p->zero[i] = offset;
> > +    }
> > +
> >      return 0;
> >  }
> >  
> > @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
> >  
> >              stat64_add(&mig_stats.multifd_bytes,
> >                         p->next_packet_size + p->packet_len);
> > -            stat64_add(&mig_stats.normal_pages, pages->num);
> > +            stat64_add(&mig_stats.normal_pages, pages->normal_num);
> >              stat64_add(&mig_stats.zero_pages, pages->zero_num);
> >  
> >              multifd_pages_reset(p->pages);
> > @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
> >          p->flags &= ~MULTIFD_FLAG_SYNC;
> >          qemu_mutex_unlock(&p->mutex);
> >  
> > -        if (p->normal_num) {
> > +        if (p->normal_num + p->zero_num) {
> > +            assert(!(flags & MULTIFD_FLAG_SYNC));
> 
> This breaks 8.2 -> 9.0 migration. QEMU 8.2 is still sending the SYNC
> along with the data packet.
> 
> >              ret = multifd_recv_state->ops->recv_pages(p, &local_err);
> >              if (ret != 0) {
> >                  break;
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index 9822ff298a..125f0bbe60 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -53,6 +53,11 @@ typedef struct {
> >      uint32_t unused32[1];    /* Reserved for future use */
> >      uint64_t unused64[3];    /* Reserved for future use */
> >      char ramblock[256];
> > +    /*
> > +     * This array contains the pointers to:
> > +     *  - normal pages (initial normal_pages entries)
> > +     *  - zero pages (following zero_pages entries)
> > +     */
> >      uint64_t offset[];
> >  } __attribute__((packed)) MultiFDPacket_t;
> >  
> > @@ -224,6 +229,8 @@ typedef struct {
> >  
> >  void multifd_register_ops(int method, MultiFDMethods *ops);
> >  void multifd_send_fill_packet(MultiFDSendParams *p);
> > +void multifd_zero_page_check_send(MultiFDSendParams *p);
> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
> >  
> >  static inline void multifd_send_prepare_header(MultiFDSendParams *p)
> >  {
> > diff --git a/qapi/migration.json b/qapi/migration.json
> > index 99843a8e95..e2450b92d4 100644
> > --- a/qapi/migration.json
> > +++ b/qapi/migration.json
> > @@ -660,9 +660,11 @@
> >  #
> >  # @none: Do not perform zero page checking.
> >  #
> > +# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)
> > +#
> >  ##
> >  { 'enum': 'ZeroPageDetection',
> > -  'data': [ 'legacy', 'none' ] }
> > +  'data': [ 'legacy', 'none', 'multifd' ] }
> >  
> >  ##
> >  # @BitmapMigrationBitmapAliasTransform:
>
Hao Xiang Feb. 23, 2024, 4:38 a.m. UTC | #6
On Fri, Feb 16, 2024 at 9:08 PM Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> On 2/16/24 12:39, Hao Xiang wrote:
> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > +{
> > +    for (int i = 0; i < p->zero_num; i++) {
> > +        void *page = p->host + p->zero[i];
> > +        if (!buffer_is_zero(page, p->page_size)) {
> > +            memset(page, 0, p->page_size);
> > +        }
> > +    }
> > +}
>
> You should not check the buffer is zero here, you should just zero it.

I will fix it in the next version.

>
>
> r~
Hao Xiang Feb. 23, 2024, 4:59 a.m. UTC | #7
On Wed, Feb 21, 2024 at 8:00 AM Elena Ufimtseva <ufimtseva@gmail.com> wrote:
>
>
>
> On Fri, Feb 16, 2024 at 2:42 PM Hao Xiang <hao.xiang@bytedance.com> wrote:
>>
>> 1. Implements the zero page detection and handling on the multifd
>> threads for non-compression, zlib and zstd compression backends.
>> 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
>> 3. Add proper asserts to ensure pages->normal are used for normal pages
>> in all scenarios.
>>
>> Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
>> ---
>>  migration/meson.build         |  1 +
>>  migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
>>  migration/multifd-zlib.c      | 26 ++++++++++++---
>>  migration/multifd-zstd.c      | 25 ++++++++++++---
>>  migration/multifd.c           | 50 +++++++++++++++++++++++------
>>  migration/multifd.h           |  7 +++++
>>  qapi/migration.json           |  4 ++-
>>  7 files changed, 151 insertions(+), 21 deletions(-)
>>  create mode 100644 migration/multifd-zero-page.c
>>
>> diff --git a/migration/meson.build b/migration/meson.build
>> index 92b1cc4297..1eeb915ff6 100644
>> --- a/migration/meson.build
>> +++ b/migration/meson.build
>> @@ -22,6 +22,7 @@ system_ss.add(files(
>>    'migration.c',
>>    'multifd.c',
>>    'multifd-zlib.c',
>> +  'multifd-zero-page.c',
>>    'ram-compress.c',
>>    'options.c',
>>    'postcopy-ram.c',
>> diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
>> new file mode 100644
>> index 0000000000..f0cd8e2c53
>> --- /dev/null
>> +++ b/migration/multifd-zero-page.c
>> @@ -0,0 +1,59 @@
>> +/*
>> + * Multifd zero page detection implementation.
>> + *
>> + * Copyright (c) 2024 Bytedance Inc
>> + *
>> + * Authors:
>> + *  Hao Xiang <hao.xiang@bytedance.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qemu/cutils.h"
>> +#include "exec/ramblock.h"
>> +#include "migration.h"
>> +#include "multifd.h"
>> +#include "options.h"
>> +#include "ram.h"
>> +
>> +void multifd_zero_page_check_send(MultiFDSendParams *p)
>> +{
>> +    /*
>> +     * QEMU older than 9.0 don't understand zero page
>> +     * on multifd channel. This switch is required to
>> +     * maintain backward compatibility.
>> +     */
>> +    bool use_multifd_zero_page =
>> +        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
>> +    MultiFDPages_t *pages = p->pages;
>> +    RAMBlock *rb = pages->block;
>> +
>> +    assert(pages->num != 0);
>
>
> Not needed, the check is done right before calling send_prepare.
>
>>
>> +    assert(pages->normal_num == 0);
>> +    assert(pages->zero_num == 0);
>
>
> Why these asserts are needed?

The idea is that when multifd_zero_page_check_send is called, I want
to make sure zero page checking was not processed on this packet
before. It is perhaps redundant. It won't compile in free build.

>>
>> +
>>
>> +    for (int i = 0; i < pages->num; i++) {
>> +        uint64_t offset = pages->offset[i];
>> +        if (use_multifd_zero_page &&
>> +            buffer_is_zero(rb->host + offset, p->page_size)) {
>> +            pages->zero[pages->zero_num] = offset;
>> +            pages->zero_num++;
>> +            ram_release_page(rb->idstr, offset);
>> +        } else {
>> +            pages->normal[pages->normal_num] = offset;
>> +            pages->normal_num++;
>> +        }
>> +    }
>> +}
>> +
>> +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
>> +{
>> +    for (int i = 0; i < p->zero_num; i++) {
>> +        void *page = p->host + p->zero[i];
>> +        if (!buffer_is_zero(page, p->page_size)) {
>> +            memset(page, 0, p->page_size);
>> +        }
>> +    }
>> +}
>> diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
>> index 012e3bdea1..cdfe0fa70e 100644
>> --- a/migration/multifd-zlib.c
>> +++ b/migration/multifd-zlib.c
>> @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>>      int ret;
>>      uint32_t i;
>>
>> +    multifd_zero_page_check_send(p);
>> +
>> +    if (!pages->normal_num) {
>> +        p->next_packet_size = 0;
>> +        goto out;
>> +    }
>> +
>>      multifd_send_prepare_header(p);
>>
>> -    for (i = 0; i < pages->num; i++) {
>> +    for (i = 0; i < pages->normal_num; i++) {
>>          uint32_t available = z->zbuff_len - out_size;
>>          int flush = Z_NO_FLUSH;
>>
>> -        if (i == pages->num - 1) {
>> +        if (i == pages->normal_num - 1) {
>>              flush = Z_SYNC_FLUSH;
>>          }
>>
>> @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>>           * with compression. zlib does not guarantee that this is safe,
>>           * therefore copy the page before calling deflate().
>>           */
>> -        memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
>> +        memcpy(z->buf, p->pages->block->host + pages->normal[i], p->page_size);
>>          zs->avail_in = p->page_size;
>>          zs->next_in = z->buf;
>>
>> @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>>      p->iov[p->iovs_num].iov_len = out_size;
>>      p->iovs_num++;
>>      p->next_packet_size = out_size;
>> -    p->flags |= MULTIFD_FLAG_ZLIB;
>>
>> +out:
>> +    p->flags |= MULTIFD_FLAG_ZLIB;
>>      multifd_send_fill_packet(p);
>> -
>
> Spurious?

We need to set the flag anyway otherwise the receiver side will complain.

>
>>      return 0;
>>  }
>>
>> @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
>>                     p->id, flags, MULTIFD_FLAG_ZLIB);
>>          return -1;
>>      }
>> +
>> +    multifd_zero_page_check_recv(p);
>> +
>> +    if (!p->normal_num) {
>> +        assert(in_size == 0);
>> +        return 0;
>
>
> return here will have no effect. Also, why is assert needed here?

We return here so we don't end up calling qio_channel_read_all with
buflen = 0. The assert makes sure that normal_num/next_packet_size
states are consistent.

> This change also does not seem to fit the description of the patch, probaby separate patch will be better.

These changes are needed to ensure basic functionalities are working
after enabling the multifd zero page format.

>
>>
>> +    }
>> +
>>      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>>
>>      if (ret != 0) {
>> @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
>>                     p->id, out_size, expected_size);
>>          return -1;
>>      }
>> +
>>      return 0;
>>  }
>>
>> diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
>> index dc8fe43e94..27a1eba075 100644
>> --- a/migration/multifd-zstd.c
>> +++ b/migration/multifd-zstd.c
>> @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
>>      int ret;
>>      uint32_t i;
>>
>> +    multifd_zero_page_check_send(p);
>> +
>> +    if (!pages->normal_num) {
>> +        p->next_packet_size = 0;
>> +        goto out;
>> +    }
>> +
>>      multifd_send_prepare_header(p);
>>
>>      z->out.dst = z->zbuff;
>>      z->out.size = z->zbuff_len;
>>      z->out.pos = 0;
>>
>> -    for (i = 0; i < pages->num; i++) {
>> +    for (i = 0; i < pages->normal_num; i++) {
>>          ZSTD_EndDirective flush = ZSTD_e_continue;
>>
>> -        if (i == pages->num - 1) {
>> +        if (i == pages->normal_num - 1) {
>>              flush = ZSTD_e_flush;
>>          }
>> -        z->in.src = p->pages->block->host + pages->offset[i];
>> +        z->in.src = p->pages->block->host + pages->normal[i];
>>          z->in.size = p->page_size;
>>          z->in.pos = 0;
>>
>> @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
>>      p->iov[p->iovs_num].iov_len = z->out.pos;
>>      p->iovs_num++;
>>      p->next_packet_size = z->out.pos;
>> -    p->flags |= MULTIFD_FLAG_ZSTD;
>>
>> +out:
>> +    p->flags |= MULTIFD_FLAG_ZSTD;
>>      multifd_send_fill_packet(p);
>> -
>
> Spurious removal.

Can you elaborate on this?

>
>>
>>      return 0;
>>  }
>>
>> @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
>>                     p->id, flags, MULTIFD_FLAG_ZSTD);
>>          return -1;
>>      }
>> +
>> +    multifd_zero_page_check_recv(p);
>> +
>> +    if (!p->normal_num) {
>> +        assert(in_size == 0);
>> +        return 0;
>> +    }
>> +
>
> Same question here about assert.
>
>>
>>      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>>
>>      if (ret != 0) {
>> diff --git a/migration/multifd.c b/migration/multifd.c
>> index a33dba40d9..fbb40ea10b 100644
>> --- a/migration/multifd.c
>> +++ b/migration/multifd.c
>> @@ -11,6 +11,7 @@
>>   */
>>
>>  #include "qemu/osdep.h"
>> +#include "qemu/cutils.h"
>>  #include "qemu/rcu.h"
>>  #include "exec/target_page.h"
>>  #include "sysemu/sysemu.h"
>> @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
>>      MultiFDPages_t *pages = p->pages;
>>      int ret;
>>
>> +    multifd_zero_page_check_send(p);
>> +
>>      if (!use_zero_copy_send) {
>>          /*
>>           * Only !zerocopy needs the header in IOV; zerocopy will
>> @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
>>          multifd_send_prepare_header(p);
>>      }
>>
>> -    for (int i = 0; i < pages->num; i++) {
>> -        p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
>> +    for (int i = 0; i < pages->normal_num; i++) {
>> +        p->iov[p->iovs_num].iov_base = pages->block->host + pages->normal[i];
>>          p->iov[p->iovs_num].iov_len = p->page_size;
>>          p->iovs_num++;
>>      }
>>
>> -    p->next_packet_size = pages->num * p->page_size;
>> +    p->next_packet_size = pages->normal_num * p->page_size;
>>      p->flags |= MULTIFD_FLAG_NOCOMP;
>>
>>      multifd_send_fill_packet(p);
>> @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
>>                     p->id, flags, MULTIFD_FLAG_NOCOMP);
>>          return -1;
>>      }
>> +
>> +    multifd_zero_page_check_recv(p);
>> +
>> +    if (!p->normal_num) {
>> +        return 0;
>> +    }
>> +
>>      for (int i = 0; i < p->normal_num; i++) {
>>          p->iov[i].iov_base = p->host + p->normal[i];
>>          p->iov[i].iov_len = p->page_size;
>> @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>>
>>      packet->flags = cpu_to_be32(p->flags);
>>      packet->pages_alloc = cpu_to_be32(p->pages->allocated);
>> -    packet->normal_pages = cpu_to_be32(pages->num);
>> +    packet->normal_pages = cpu_to_be32(pages->normal_num);
>>      packet->zero_pages = cpu_to_be32(pages->zero_num);
>>      packet->next_packet_size = cpu_to_be32(p->next_packet_size);
>>
>> @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>>          strncpy(packet->ramblock, pages->block->idstr, 256);
>>      }
>>
>> -    for (i = 0; i < pages->num; i++) {
>> +    for (i = 0; i < pages->normal_num; i++) {
>>          /* there are architectures where ram_addr_t is 32 bit */
>> -        uint64_t temp = pages->offset[i];
>> +        uint64_t temp = pages->normal[i];
>>
>>          packet->offset[i] = cpu_to_be64(temp);
>>      }
>>
>> +    for (i = 0; i < pages->zero_num; i++) {
>> +        /* there are architectures where ram_addr_t is 32 bit */
>> +        uint64_t temp = pages->zero[i];
>> +
>> +        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
>> +    }
>> +
>>      p->packets_sent++;
>> -    p->total_normal_pages += pages->num;
>> +    p->total_normal_pages += pages->normal_num;
>>      p->total_zero_pages += pages->zero_num;
>>
>> -    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
>> +    trace_multifd_send(p->id, packet_num, pages->normal_num, pages->zero_num,
>>                         p->flags, p->next_packet_size);
>>  }
>>
>> @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>>          p->normal[i] = offset;
>>      }
>>
>> +    for (i = 0; i < p->zero_num; i++) {
>> +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
>> +
>> +        if (offset > (p->block->used_length - p->page_size)) {
>> +            error_setg(errp, "multifd: offset too long %" PRIu64
>> +                       " (max " RAM_ADDR_FMT ")",
>> +                       offset, p->block->used_length);
>> +            return -1;
>> +        }
>> +        p->zero[i] = offset;
>> +    }
>> +
>>      return 0;
>>  }
>>
>> @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
>>
>>              stat64_add(&mig_stats.multifd_bytes,
>>                         p->next_packet_size + p->packet_len);
>> -            stat64_add(&mig_stats.normal_pages, pages->num);
>> +            stat64_add(&mig_stats.normal_pages, pages->normal_num);
>>              stat64_add(&mig_stats.zero_pages, pages->zero_num);
>>
>>              multifd_pages_reset(p->pages);
>> @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
>>          p->flags &= ~MULTIFD_FLAG_SYNC;
>>          qemu_mutex_unlock(&p->mutex);
>>
>> -        if (p->normal_num) {
>> +        if (p->normal_num + p->zero_num) {
>> +            assert(!(flags & MULTIFD_FLAG_SYNC));
>
> This assertion seems to be not relevant to this patch. Could you post it separately and explain why it's needed here?

I thought this is nice to have because it ensures that sync packet and
data packet are mutually exclusive. I agree this is not needed for
things to work but does this deserve its own patch?

>
>>
>>              ret = multifd_recv_state->ops->recv_pages(p, &local_err);
>>              if (ret != 0) {
>>                  break;
>> diff --git a/migration/multifd.h b/migration/multifd.h
>> index 9822ff298a..125f0bbe60 100644
>> --- a/migration/multifd.h
>> +++ b/migration/multifd.h
>> @@ -53,6 +53,11 @@ typedef struct {
>>      uint32_t unused32[1];    /* Reserved for future use */
>>      uint64_t unused64[3];    /* Reserved for future use */
>>      char ramblock[256];
>> +    /*
>> +     * This array contains the pointers to:
>> +     *  - normal pages (initial normal_pages entries)
>> +     *  - zero pages (following zero_pages entries)
>> +     */
>>      uint64_t offset[];
>>  } __attribute__((packed)) MultiFDPacket_t;
>>
>> @@ -224,6 +229,8 @@ typedef struct {
>>
>>  void multifd_register_ops(int method, MultiFDMethods *ops);
>>  void multifd_send_fill_packet(MultiFDSendParams *p);
>> +void multifd_zero_page_check_send(MultiFDSendParams *p);
>> +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
>>
>>  static inline void multifd_send_prepare_header(MultiFDSendParams *p)
>>  {
>> diff --git a/qapi/migration.json b/qapi/migration.json
>> index 99843a8e95..e2450b92d4 100644
>> --- a/qapi/migration.json
>> +++ b/qapi/migration.json
>> @@ -660,9 +660,11 @@
>>  #
>>  # @none: Do not perform zero page checking.
>>  #
>> +# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)
>> +#
>>  ##
>>  { 'enum': 'ZeroPageDetection',
>> -  'data': [ 'legacy', 'none' ] }
>> +  'data': [ 'legacy', 'none', 'multifd' ] }
>>
>>  ##
>>  # @BitmapMigrationBitmapAliasTransform:
>> --
>> 2.30.2
>>
>>
>
>
> --
> Elena
Hao Xiang Feb. 23, 2024, 5:15 a.m. UTC | #8
On Thu, Feb 22, 2024 at 6:21 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, Feb 21, 2024 at 06:04:10PM -0300, Fabiano Rosas wrote:
> > Hao Xiang <hao.xiang@bytedance.com> writes:
> >
> > > 1. Implements the zero page detection and handling on the multifd
> > > threads for non-compression, zlib and zstd compression backends.
> > > 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> > > 3. Add proper asserts to ensure pages->normal are used for normal pages
> > > in all scenarios.
> > >
> > > Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
> > > ---
> > >  migration/meson.build         |  1 +
> > >  migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
> > >  migration/multifd-zlib.c      | 26 ++++++++++++---
> > >  migration/multifd-zstd.c      | 25 ++++++++++++---
> > >  migration/multifd.c           | 50 +++++++++++++++++++++++------
> > >  migration/multifd.h           |  7 +++++
> > >  qapi/migration.json           |  4 ++-
> > >  7 files changed, 151 insertions(+), 21 deletions(-)
> > >  create mode 100644 migration/multifd-zero-page.c
> > >
> > > diff --git a/migration/meson.build b/migration/meson.build
> > > index 92b1cc4297..1eeb915ff6 100644
> > > --- a/migration/meson.build
> > > +++ b/migration/meson.build
> > > @@ -22,6 +22,7 @@ system_ss.add(files(
> > >    'migration.c',
> > >    'multifd.c',
> > >    'multifd-zlib.c',
> > > +  'multifd-zero-page.c',
> > >    'ram-compress.c',
> > >    'options.c',
> > >    'postcopy-ram.c',
> > > diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> > > new file mode 100644
> > > index 0000000000..f0cd8e2c53
> > > --- /dev/null
> > > +++ b/migration/multifd-zero-page.c
> > > @@ -0,0 +1,59 @@
> > > +/*
> > > + * Multifd zero page detection implementation.
> > > + *
> > > + * Copyright (c) 2024 Bytedance Inc
> > > + *
> > > + * Authors:
> > > + *  Hao Xiang <hao.xiang@bytedance.com>
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > + * See the COPYING file in the top-level directory.
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +#include "qemu/cutils.h"
> > > +#include "exec/ramblock.h"
> > > +#include "migration.h"
> > > +#include "multifd.h"
> > > +#include "options.h"
> > > +#include "ram.h"
> > > +
> > > +void multifd_zero_page_check_send(MultiFDSendParams *p)
> > > +{
> > > +    /*
> > > +     * QEMU older than 9.0 don't understand zero page
> > > +     * on multifd channel. This switch is required to
> > > +     * maintain backward compatibility.
> > > +     */
> > > +    bool use_multifd_zero_page =
> > > +        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> > > +    MultiFDPages_t *pages = p->pages;
> > > +    RAMBlock *rb = pages->block;
> > > +
> > > +    assert(pages->num != 0);
> > > +    assert(pages->normal_num == 0);
> > > +    assert(pages->zero_num == 0);
> >
> > We can drop these before the final version.
> >
> > > +
> > > +    for (int i = 0; i < pages->num; i++) {
> > > +        uint64_t offset = pages->offset[i];
> > > +        if (use_multifd_zero_page &&
> > > +            buffer_is_zero(rb->host + offset, p->page_size)) {
> > > +            pages->zero[pages->zero_num] = offset;
> > > +            pages->zero_num++;
> > > +            ram_release_page(rb->idstr, offset);
> > > +        } else {
> > > +            pages->normal[pages->normal_num] = offset;
> > > +            pages->normal_num++;
> > > +        }
> > > +    }
> >
> > I don't think it's super clean to have three arrays offset, zero and
> > normal, all sized for the full packet size. It might be possible to just
> > carry a bitmap of non-zero pages along with pages->offset and operate on
> > that instead.
> >
> > What do you think?
> >
> > Peter, any ideas? Should we just leave this for another time?
>
> Yeah I think a bitmap should save quite a few fields indeed, it'll however
> make the latter iteration slightly harder by walking both (offset[],
> bitmap), process the page only if bitmap is set for the offset.
>
> IIUC we perhaps don't even need a bitmap?  AFAIU what we only need in
> Multifdpages_t is one extra field to mark "how many normal pages", aka,
> normal_num here (zero_num can be calculated from num-normal_num).  Then
> the zero page detection logic should do two things:
>
>   - Sort offset[] array so that it starts with normal pages, followed up by
>     zero pages
>
>   - Setup normal_num to be the number of normal pages
>
> Then we reduce 2 new arrays (normal[], zero[]) + 2 new fields (normal_num,
> zero_num) -> 1 new field (normal_num).  It'll also be trivial to fill the
> packet header later because offset[] is exactly that.
>
> Side note - I still think it's confusing to read this patch and previous
> patch separately.  Obviously previous patch introduced these new fields
> without justifying their values yet.  IMHO it'll be easier to review if you
> merge the two patches.

Fabiano, thanks for catching this. I totally missed the backward
compatibility thing.
Peter, I will code the sorting and merge this patch with the previous one.

>
> >
> > > +}
> > > +
> > > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > > +{
> > > +    for (int i = 0; i < p->zero_num; i++) {
> > > +        void *page = p->host + p->zero[i];
> > > +        if (!buffer_is_zero(page, p->page_size)) {
> > > +            memset(page, 0, p->page_size);
> > > +        }
> > > +    }
> > > +}
> > > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > > index 012e3bdea1..cdfe0fa70e 100644
> > > --- a/migration/multifd-zlib.c
> > > +++ b/migration/multifd-zlib.c
> > > @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> > >      int ret;
> > >      uint32_t i;
> > >
> > > +    multifd_zero_page_check_send(p);
> > > +
> > > +    if (!pages->normal_num) {
> > > +        p->next_packet_size = 0;
> > > +        goto out;
> > > +    }
> > > +
> > >      multifd_send_prepare_header(p);
> > >
> > > -    for (i = 0; i < pages->num; i++) {
> > > +    for (i = 0; i < pages->normal_num; i++) {
> > >          uint32_t available = z->zbuff_len - out_size;
> > >          int flush = Z_NO_FLUSH;
> > >
> > > -        if (i == pages->num - 1) {
> > > +        if (i == pages->normal_num - 1) {
> > >              flush = Z_SYNC_FLUSH;
> > >          }
> > >
> > > @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> > >           * with compression. zlib does not guarantee that this is safe,
> > >           * therefore copy the page before calling deflate().
> > >           */
> > > -        memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
> > > +        memcpy(z->buf, p->pages->block->host + pages->normal[i], p->page_size);
> > >          zs->avail_in = p->page_size;
> > >          zs->next_in = z->buf;
> > >
> > > @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> > >      p->iov[p->iovs_num].iov_len = out_size;
> > >      p->iovs_num++;
> > >      p->next_packet_size = out_size;
> > > -    p->flags |= MULTIFD_FLAG_ZLIB;
> > >
> > > +out:
> > > +    p->flags |= MULTIFD_FLAG_ZLIB;
> > >      multifd_send_fill_packet(p);
> > > -
> > >      return 0;
> > >  }
> > >
> > > @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
> > >                     p->id, flags, MULTIFD_FLAG_ZLIB);
> > >          return -1;
> > >      }
> > > +
> > > +    multifd_zero_page_check_recv(p);
> > > +
> > > +    if (!p->normal_num) {
> > > +        assert(in_size == 0);
> > > +        return 0;
> > > +    }
> > > +
> > >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
> > >
> > >      if (ret != 0) {
> > > @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
> > >                     p->id, out_size, expected_size);
> > >          return -1;
> > >      }
> > > +
> > >      return 0;
> > >  }
> > >
> > > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> > > index dc8fe43e94..27a1eba075 100644
> > > --- a/migration/multifd-zstd.c
> > > +++ b/migration/multifd-zstd.c
> > > @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
> > >      int ret;
> > >      uint32_t i;
> > >
> > > +    multifd_zero_page_check_send(p);
> > > +
> > > +    if (!pages->normal_num) {
> > > +        p->next_packet_size = 0;
> > > +        goto out;
> > > +    }
> > > +
> > >      multifd_send_prepare_header(p);
>
> If this forms a pattern we can introduce multifd_send_prepare_common():

I will add that in the next version.

>
> bool multifd_send_prepare_common()
> {
>     multifd_zero_page_check_send();
>     if (...) {
>
>     }
>     multifd_send_prepare_header();
> }
>
> > >
> > >      z->out.dst = z->zbuff;
> > >      z->out.size = z->zbuff_len;
> > >      z->out.pos = 0;
> > >
> > > -    for (i = 0; i < pages->num; i++) {
> > > +    for (i = 0; i < pages->normal_num; i++) {
> > >          ZSTD_EndDirective flush = ZSTD_e_continue;
> > >
> > > -        if (i == pages->num - 1) {
> > > +        if (i == pages->normal_num - 1) {
> > >              flush = ZSTD_e_flush;
> > >          }
> > > -        z->in.src = p->pages->block->host + pages->offset[i];
> > > +        z->in.src = p->pages->block->host + pages->normal[i];
> > >          z->in.size = p->page_size;
> > >          z->in.pos = 0;
> > >
> > > @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
> > >      p->iov[p->iovs_num].iov_len = z->out.pos;
> > >      p->iovs_num++;
> > >      p->next_packet_size = z->out.pos;
> > > -    p->flags |= MULTIFD_FLAG_ZSTD;
> > >
> > > +out:
> > > +    p->flags |= MULTIFD_FLAG_ZSTD;
> > >      multifd_send_fill_packet(p);
> > > -
> > >      return 0;
> > >  }
> > >
> > > @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
> > >                     p->id, flags, MULTIFD_FLAG_ZSTD);
> > >          return -1;
> > >      }
> > > +
> > > +    multifd_zero_page_check_recv(p);
> > > +
> > > +    if (!p->normal_num) {
> > > +        assert(in_size == 0);
> > > +        return 0;
> > > +    }
> > > +
> > >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
> > >
> > >      if (ret != 0) {
> > > diff --git a/migration/multifd.c b/migration/multifd.c
> > > index a33dba40d9..fbb40ea10b 100644
> > > --- a/migration/multifd.c
> > > +++ b/migration/multifd.c
> > > @@ -11,6 +11,7 @@
> > >   */
> > >
> > >  #include "qemu/osdep.h"
> > > +#include "qemu/cutils.h"
> > >  #include "qemu/rcu.h"
> > >  #include "exec/target_page.h"
> > >  #include "sysemu/sysemu.h"
> > > @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> > >      MultiFDPages_t *pages = p->pages;
> > >      int ret;
> > >
> > > +    multifd_zero_page_check_send(p);
> > > +
> > >      if (!use_zero_copy_send) {
> > >          /*
> > >           * Only !zerocopy needs the header in IOV; zerocopy will
> > > @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> > >          multifd_send_prepare_header(p);
> > >      }
> > >
> > > -    for (int i = 0; i < pages->num; i++) {
> > > -        p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
> > > +    for (int i = 0; i < pages->normal_num; i++) {
> > > +        p->iov[p->iovs_num].iov_base = pages->block->host + pages->normal[i];
> > >          p->iov[p->iovs_num].iov_len = p->page_size;
> > >          p->iovs_num++;
> > >      }
> > >
> > > -    p->next_packet_size = pages->num * p->page_size;
> > > +    p->next_packet_size = pages->normal_num * p->page_size;
> > >      p->flags |= MULTIFD_FLAG_NOCOMP;
> > >
> > >      multifd_send_fill_packet(p);
> > > @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
> > >                     p->id, flags, MULTIFD_FLAG_NOCOMP);
> > >          return -1;
> > >      }
> > > +
> > > +    multifd_zero_page_check_recv(p);
> > > +
> > > +    if (!p->normal_num) {
> > > +        return 0;
> > > +    }
> > > +
> > >      for (int i = 0; i < p->normal_num; i++) {
> > >          p->iov[i].iov_base = p->host + p->normal[i];
> > >          p->iov[i].iov_len = p->page_size;
> > > @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
> > >
> > >      packet->flags = cpu_to_be32(p->flags);
> > >      packet->pages_alloc = cpu_to_be32(p->pages->allocated);
> > > -    packet->normal_pages = cpu_to_be32(pages->num);
> > > +    packet->normal_pages = cpu_to_be32(pages->normal_num);
> > >      packet->zero_pages = cpu_to_be32(pages->zero_num);
> > >      packet->next_packet_size = cpu_to_be32(p->next_packet_size);
> > >
> > > @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
> > >          strncpy(packet->ramblock, pages->block->idstr, 256);
> > >      }
> > >
> > > -    for (i = 0; i < pages->num; i++) {
> > > +    for (i = 0; i < pages->normal_num; i++) {
> > >          /* there are architectures where ram_addr_t is 32 bit */
> > > -        uint64_t temp = pages->offset[i];
> > > +        uint64_t temp = pages->normal[i];
> > >
> > >          packet->offset[i] = cpu_to_be64(temp);
> > >      }
> > >
> > > +    for (i = 0; i < pages->zero_num; i++) {
> > > +        /* there are architectures where ram_addr_t is 32 bit */
> > > +        uint64_t temp = pages->zero[i];
> > > +
> > > +        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
> > > +    }
> > > +
> > >      p->packets_sent++;
> > > -    p->total_normal_pages += pages->num;
> > > +    p->total_normal_pages += pages->normal_num;
> > >      p->total_zero_pages += pages->zero_num;
> > >
> > > -    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
> > > +    trace_multifd_send(p->id, packet_num, pages->normal_num, pages->zero_num,
> > >                         p->flags, p->next_packet_size);
> > >  }
> > >
> > > @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> > >          p->normal[i] = offset;
> > >      }
> > >
> > > +    for (i = 0; i < p->zero_num; i++) {
> > > +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
> > > +
> > > +        if (offset > (p->block->used_length - p->page_size)) {
> > > +            error_setg(errp, "multifd: offset too long %" PRIu64
> > > +                       " (max " RAM_ADDR_FMT ")",
> > > +                       offset, p->block->used_length);
> > > +            return -1;
> > > +        }
> > > +        p->zero[i] = offset;
> > > +    }
> > > +
> > >      return 0;
> > >  }
> > >
> > > @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
> > >
> > >              stat64_add(&mig_stats.multifd_bytes,
> > >                         p->next_packet_size + p->packet_len);
> > > -            stat64_add(&mig_stats.normal_pages, pages->num);
> > > +            stat64_add(&mig_stats.normal_pages, pages->normal_num);
> > >              stat64_add(&mig_stats.zero_pages, pages->zero_num);
> > >
> > >              multifd_pages_reset(p->pages);
> > > @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
> > >          p->flags &= ~MULTIFD_FLAG_SYNC;
> > >          qemu_mutex_unlock(&p->mutex);
> > >
> > > -        if (p->normal_num) {
> > > +        if (p->normal_num + p->zero_num) {
> > > +            assert(!(flags & MULTIFD_FLAG_SYNC));
> >
> > This breaks 8.2 -> 9.0 migration. QEMU 8.2 is still sending the SYNC
> > along with the data packet.
> >
> > >              ret = multifd_recv_state->ops->recv_pages(p, &local_err);
> > >              if (ret != 0) {
> > >                  break;
> > > diff --git a/migration/multifd.h b/migration/multifd.h
> > > index 9822ff298a..125f0bbe60 100644
> > > --- a/migration/multifd.h
> > > +++ b/migration/multifd.h
> > > @@ -53,6 +53,11 @@ typedef struct {
> > >      uint32_t unused32[1];    /* Reserved for future use */
> > >      uint64_t unused64[3];    /* Reserved for future use */
> > >      char ramblock[256];
> > > +    /*
> > > +     * This array contains the pointers to:
> > > +     *  - normal pages (initial normal_pages entries)
> > > +     *  - zero pages (following zero_pages entries)
> > > +     */
> > >      uint64_t offset[];
> > >  } __attribute__((packed)) MultiFDPacket_t;
> > >
> > > @@ -224,6 +229,8 @@ typedef struct {
> > >
> > >  void multifd_register_ops(int method, MultiFDMethods *ops);
> > >  void multifd_send_fill_packet(MultiFDSendParams *p);
> > > +void multifd_zero_page_check_send(MultiFDSendParams *p);
> > > +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
> > >
> > >  static inline void multifd_send_prepare_header(MultiFDSendParams *p)
> > >  {
> > > diff --git a/qapi/migration.json b/qapi/migration.json
> > > index 99843a8e95..e2450b92d4 100644
> > > --- a/qapi/migration.json
> > > +++ b/qapi/migration.json
> > > @@ -660,9 +660,11 @@
> > >  #
> > >  # @none: Do not perform zero page checking.
> > >  #
> > > +# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)
> > > +#
> > >  ##
> > >  { 'enum': 'ZeroPageDetection',
> > > -  'data': [ 'legacy', 'none' ] }
> > > +  'data': [ 'legacy', 'none', 'multifd' ] }
> > >
> > >  ##
> > >  # @BitmapMigrationBitmapAliasTransform:
> >
>
> --
> Peter Xu
>
Hao Xiang Feb. 23, 2024, 5:18 a.m. UTC | #9
On Wed, Feb 21, 2024 at 1:04 PM Fabiano Rosas <farosas@suse.de> wrote:
>
> Hao Xiang <hao.xiang@bytedance.com> writes:
>
> > 1. Implements the zero page detection and handling on the multifd
> > threads for non-compression, zlib and zstd compression backends.
> > 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> > 3. Add proper asserts to ensure pages->normal are used for normal pages
> > in all scenarios.
> >
> > Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
> > ---
> >  migration/meson.build         |  1 +
> >  migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
> >  migration/multifd-zlib.c      | 26 ++++++++++++---
> >  migration/multifd-zstd.c      | 25 ++++++++++++---
> >  migration/multifd.c           | 50 +++++++++++++++++++++++------
> >  migration/multifd.h           |  7 +++++
> >  qapi/migration.json           |  4 ++-
> >  7 files changed, 151 insertions(+), 21 deletions(-)
> >  create mode 100644 migration/multifd-zero-page.c
> >
> > diff --git a/migration/meson.build b/migration/meson.build
> > index 92b1cc4297..1eeb915ff6 100644
> > --- a/migration/meson.build
> > +++ b/migration/meson.build
> > @@ -22,6 +22,7 @@ system_ss.add(files(
> >    'migration.c',
> >    'multifd.c',
> >    'multifd-zlib.c',
> > +  'multifd-zero-page.c',
> >    'ram-compress.c',
> >    'options.c',
> >    'postcopy-ram.c',
> > diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> > new file mode 100644
> > index 0000000000..f0cd8e2c53
> > --- /dev/null
> > +++ b/migration/multifd-zero-page.c
> > @@ -0,0 +1,59 @@
> > +/*
> > + * Multifd zero page detection implementation.
> > + *
> > + * Copyright (c) 2024 Bytedance Inc
> > + *
> > + * Authors:
> > + *  Hao Xiang <hao.xiang@bytedance.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + */
> > +
> > +#include "qemu/osdep.h"
> > +#include "qemu/cutils.h"
> > +#include "exec/ramblock.h"
> > +#include "migration.h"
> > +#include "multifd.h"
> > +#include "options.h"
> > +#include "ram.h"
> > +
> > +void multifd_zero_page_check_send(MultiFDSendParams *p)
> > +{
> > +    /*
> > +     * QEMU older than 9.0 don't understand zero page
> > +     * on multifd channel. This switch is required to
> > +     * maintain backward compatibility.
> > +     */
> > +    bool use_multifd_zero_page =
> > +        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> > +    MultiFDPages_t *pages = p->pages;
> > +    RAMBlock *rb = pages->block;
> > +
> > +    assert(pages->num != 0);
> > +    assert(pages->normal_num == 0);
> > +    assert(pages->zero_num == 0);
>
> We can drop these before the final version.

Elena has the same concern. I will drop these.

>
> > +
> > +    for (int i = 0; i < pages->num; i++) {
> > +        uint64_t offset = pages->offset[i];
> > +        if (use_multifd_zero_page &&
> > +            buffer_is_zero(rb->host + offset, p->page_size)) {
> > +            pages->zero[pages->zero_num] = offset;
> > +            pages->zero_num++;
> > +            ram_release_page(rb->idstr, offset);
> > +        } else {
> > +            pages->normal[pages->normal_num] = offset;
> > +            pages->normal_num++;
> > +        }
> > +    }
>
> I don't think it's super clean to have three arrays offset, zero and
> normal, all sized for the full packet size. It might be possible to just
> carry a bitmap of non-zero pages along with pages->offset and operate on
> that instead.
>
> What do you think?
>
> Peter, any ideas? Should we just leave this for another time?
>
> > +}
> > +
> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > +{
> > +    for (int i = 0; i < p->zero_num; i++) {
> > +        void *page = p->host + p->zero[i];
> > +        if (!buffer_is_zero(page, p->page_size)) {
> > +            memset(page, 0, p->page_size);
> > +        }
> > +    }
> > +}
> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > index 012e3bdea1..cdfe0fa70e 100644
> > --- a/migration/multifd-zlib.c
> > +++ b/migration/multifd-zlib.c
> > @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> >      int ret;
> >      uint32_t i;
> >
> > +    multifd_zero_page_check_send(p);
> > +
> > +    if (!pages->normal_num) {
> > +        p->next_packet_size = 0;
> > +        goto out;
> > +    }
> > +
> >      multifd_send_prepare_header(p);
> >
> > -    for (i = 0; i < pages->num; i++) {
> > +    for (i = 0; i < pages->normal_num; i++) {
> >          uint32_t available = z->zbuff_len - out_size;
> >          int flush = Z_NO_FLUSH;
> >
> > -        if (i == pages->num - 1) {
> > +        if (i == pages->normal_num - 1) {
> >              flush = Z_SYNC_FLUSH;
> >          }
> >
> > @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> >           * with compression. zlib does not guarantee that this is safe,
> >           * therefore copy the page before calling deflate().
> >           */
> > -        memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
> > +        memcpy(z->buf, p->pages->block->host + pages->normal[i], p->page_size);
> >          zs->avail_in = p->page_size;
> >          zs->next_in = z->buf;
> >
> > @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> >      p->iov[p->iovs_num].iov_len = out_size;
> >      p->iovs_num++;
> >      p->next_packet_size = out_size;
> > -    p->flags |= MULTIFD_FLAG_ZLIB;
> >
> > +out:
> > +    p->flags |= MULTIFD_FLAG_ZLIB;
> >      multifd_send_fill_packet(p);
> > -
> >      return 0;
> >  }
> >
> > @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
> >                     p->id, flags, MULTIFD_FLAG_ZLIB);
> >          return -1;
> >      }
> > +
> > +    multifd_zero_page_check_recv(p);
> > +
> > +    if (!p->normal_num) {
> > +        assert(in_size == 0);
> > +        return 0;
> > +    }
> > +
> >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
> >
> >      if (ret != 0) {
> > @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
> >                     p->id, out_size, expected_size);
> >          return -1;
> >      }
> > +
> >      return 0;
> >  }
> >
> > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> > index dc8fe43e94..27a1eba075 100644
> > --- a/migration/multifd-zstd.c
> > +++ b/migration/multifd-zstd.c
> > @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
> >      int ret;
> >      uint32_t i;
> >
> > +    multifd_zero_page_check_send(p);
> > +
> > +    if (!pages->normal_num) {
> > +        p->next_packet_size = 0;
> > +        goto out;
> > +    }
> > +
> >      multifd_send_prepare_header(p);
> >
> >      z->out.dst = z->zbuff;
> >      z->out.size = z->zbuff_len;
> >      z->out.pos = 0;
> >
> > -    for (i = 0; i < pages->num; i++) {
> > +    for (i = 0; i < pages->normal_num; i++) {
> >          ZSTD_EndDirective flush = ZSTD_e_continue;
> >
> > -        if (i == pages->num - 1) {
> > +        if (i == pages->normal_num - 1) {
> >              flush = ZSTD_e_flush;
> >          }
> > -        z->in.src = p->pages->block->host + pages->offset[i];
> > +        z->in.src = p->pages->block->host + pages->normal[i];
> >          z->in.size = p->page_size;
> >          z->in.pos = 0;
> >
> > @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
> >      p->iov[p->iovs_num].iov_len = z->out.pos;
> >      p->iovs_num++;
> >      p->next_packet_size = z->out.pos;
> > -    p->flags |= MULTIFD_FLAG_ZSTD;
> >
> > +out:
> > +    p->flags |= MULTIFD_FLAG_ZSTD;
> >      multifd_send_fill_packet(p);
> > -
> >      return 0;
> >  }
> >
> > @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
> >                     p->id, flags, MULTIFD_FLAG_ZSTD);
> >          return -1;
> >      }
> > +
> > +    multifd_zero_page_check_recv(p);
> > +
> > +    if (!p->normal_num) {
> > +        assert(in_size == 0);
> > +        return 0;
> > +    }
> > +
> >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
> >
> >      if (ret != 0) {
> > diff --git a/migration/multifd.c b/migration/multifd.c
> > index a33dba40d9..fbb40ea10b 100644
> > --- a/migration/multifd.c
> > +++ b/migration/multifd.c
> > @@ -11,6 +11,7 @@
> >   */
> >
> >  #include "qemu/osdep.h"
> > +#include "qemu/cutils.h"
> >  #include "qemu/rcu.h"
> >  #include "exec/target_page.h"
> >  #include "sysemu/sysemu.h"
> > @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> >      MultiFDPages_t *pages = p->pages;
> >      int ret;
> >
> > +    multifd_zero_page_check_send(p);
> > +
> >      if (!use_zero_copy_send) {
> >          /*
> >           * Only !zerocopy needs the header in IOV; zerocopy will
> > @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> >          multifd_send_prepare_header(p);
> >      }
> >
> > -    for (int i = 0; i < pages->num; i++) {
> > -        p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
> > +    for (int i = 0; i < pages->normal_num; i++) {
> > +        p->iov[p->iovs_num].iov_base = pages->block->host + pages->normal[i];
> >          p->iov[p->iovs_num].iov_len = p->page_size;
> >          p->iovs_num++;
> >      }
> >
> > -    p->next_packet_size = pages->num * p->page_size;
> > +    p->next_packet_size = pages->normal_num * p->page_size;
> >      p->flags |= MULTIFD_FLAG_NOCOMP;
> >
> >      multifd_send_fill_packet(p);
> > @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
> >                     p->id, flags, MULTIFD_FLAG_NOCOMP);
> >          return -1;
> >      }
> > +
> > +    multifd_zero_page_check_recv(p);
> > +
> > +    if (!p->normal_num) {
> > +        return 0;
> > +    }
> > +
> >      for (int i = 0; i < p->normal_num; i++) {
> >          p->iov[i].iov_base = p->host + p->normal[i];
> >          p->iov[i].iov_len = p->page_size;
> > @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
> >
> >      packet->flags = cpu_to_be32(p->flags);
> >      packet->pages_alloc = cpu_to_be32(p->pages->allocated);
> > -    packet->normal_pages = cpu_to_be32(pages->num);
> > +    packet->normal_pages = cpu_to_be32(pages->normal_num);
> >      packet->zero_pages = cpu_to_be32(pages->zero_num);
> >      packet->next_packet_size = cpu_to_be32(p->next_packet_size);
> >
> > @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
> >          strncpy(packet->ramblock, pages->block->idstr, 256);
> >      }
> >
> > -    for (i = 0; i < pages->num; i++) {
> > +    for (i = 0; i < pages->normal_num; i++) {
> >          /* there are architectures where ram_addr_t is 32 bit */
> > -        uint64_t temp = pages->offset[i];
> > +        uint64_t temp = pages->normal[i];
> >
> >          packet->offset[i] = cpu_to_be64(temp);
> >      }
> >
> > +    for (i = 0; i < pages->zero_num; i++) {
> > +        /* there are architectures where ram_addr_t is 32 bit */
> > +        uint64_t temp = pages->zero[i];
> > +
> > +        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
> > +    }
> > +
> >      p->packets_sent++;
> > -    p->total_normal_pages += pages->num;
> > +    p->total_normal_pages += pages->normal_num;
> >      p->total_zero_pages += pages->zero_num;
> >
> > -    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
> > +    trace_multifd_send(p->id, packet_num, pages->normal_num, pages->zero_num,
> >                         p->flags, p->next_packet_size);
> >  }
> >
> > @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> >          p->normal[i] = offset;
> >      }
> >
> > +    for (i = 0; i < p->zero_num; i++) {
> > +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
> > +
> > +        if (offset > (p->block->used_length - p->page_size)) {
> > +            error_setg(errp, "multifd: offset too long %" PRIu64
> > +                       " (max " RAM_ADDR_FMT ")",
> > +                       offset, p->block->used_length);
> > +            return -1;
> > +        }
> > +        p->zero[i] = offset;
> > +    }
> > +
> >      return 0;
> >  }
> >
> > @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
> >
> >              stat64_add(&mig_stats.multifd_bytes,
> >                         p->next_packet_size + p->packet_len);
> > -            stat64_add(&mig_stats.normal_pages, pages->num);
> > +            stat64_add(&mig_stats.normal_pages, pages->normal_num);
> >              stat64_add(&mig_stats.zero_pages, pages->zero_num);
> >
> >              multifd_pages_reset(p->pages);
> > @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
> >          p->flags &= ~MULTIFD_FLAG_SYNC;
> >          qemu_mutex_unlock(&p->mutex);
> >
> > -        if (p->normal_num) {
> > +        if (p->normal_num + p->zero_num) {
> > +            assert(!(flags & MULTIFD_FLAG_SYNC));
>
> This breaks 8.2 -> 9.0 migration. QEMU 8.2 is still sending the SYNC
> along with the data packet.

I keep missing the compatibility thing. Will remove this.

>
> >              ret = multifd_recv_state->ops->recv_pages(p, &local_err);
> >              if (ret != 0) {
> >                  break;
> > diff --git a/migration/multifd.h b/migration/multifd.h
> > index 9822ff298a..125f0bbe60 100644
> > --- a/migration/multifd.h
> > +++ b/migration/multifd.h
> > @@ -53,6 +53,11 @@ typedef struct {
> >      uint32_t unused32[1];    /* Reserved for future use */
> >      uint64_t unused64[3];    /* Reserved for future use */
> >      char ramblock[256];
> > +    /*
> > +     * This array contains the pointers to:
> > +     *  - normal pages (initial normal_pages entries)
> > +     *  - zero pages (following zero_pages entries)
> > +     */
> >      uint64_t offset[];
> >  } __attribute__((packed)) MultiFDPacket_t;
> >
> > @@ -224,6 +229,8 @@ typedef struct {
> >
> >  void multifd_register_ops(int method, MultiFDMethods *ops);
> >  void multifd_send_fill_packet(MultiFDSendParams *p);
> > +void multifd_zero_page_check_send(MultiFDSendParams *p);
> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
> >
> >  static inline void multifd_send_prepare_header(MultiFDSendParams *p)
> >  {
> > diff --git a/qapi/migration.json b/qapi/migration.json
> > index 99843a8e95..e2450b92d4 100644
> > --- a/qapi/migration.json
> > +++ b/qapi/migration.json
> > @@ -660,9 +660,11 @@
> >  #
> >  # @none: Do not perform zero page checking.
> >  #
> > +# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)
> > +#
> >  ##
> >  { 'enum': 'ZeroPageDetection',
> > -  'data': [ 'legacy', 'none' ] }
> > +  'data': [ 'legacy', 'none', 'multifd' ] }
> >
> >  ##
> >  # @BitmapMigrationBitmapAliasTransform:
Fabiano Rosas Feb. 23, 2024, 2:47 p.m. UTC | #10
Hao Xiang <hao.xiang@bytedance.com> writes:

> On Wed, Feb 21, 2024 at 1:04 PM Fabiano Rosas <farosas@suse.de> wrote:
>>
>> Hao Xiang <hao.xiang@bytedance.com> writes:
>>
>> > 1. Implements the zero page detection and handling on the multifd
>> > threads for non-compression, zlib and zstd compression backends.
>> > 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
>> > 3. Add proper asserts to ensure pages->normal are used for normal pages
>> > in all scenarios.
>> >
>> > Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
>> > ---
>> >  migration/meson.build         |  1 +
>> >  migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
>> >  migration/multifd-zlib.c      | 26 ++++++++++++---
>> >  migration/multifd-zstd.c      | 25 ++++++++++++---
>> >  migration/multifd.c           | 50 +++++++++++++++++++++++------
>> >  migration/multifd.h           |  7 +++++
>> >  qapi/migration.json           |  4 ++-
>> >  7 files changed, 151 insertions(+), 21 deletions(-)
>> >  create mode 100644 migration/multifd-zero-page.c
>> >
>> > diff --git a/migration/meson.build b/migration/meson.build
>> > index 92b1cc4297..1eeb915ff6 100644
>> > --- a/migration/meson.build
>> > +++ b/migration/meson.build
>> > @@ -22,6 +22,7 @@ system_ss.add(files(
>> >    'migration.c',
>> >    'multifd.c',
>> >    'multifd-zlib.c',
>> > +  'multifd-zero-page.c',
>> >    'ram-compress.c',
>> >    'options.c',
>> >    'postcopy-ram.c',
>> > diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
>> > new file mode 100644
>> > index 0000000000..f0cd8e2c53
>> > --- /dev/null
>> > +++ b/migration/multifd-zero-page.c
>> > @@ -0,0 +1,59 @@
>> > +/*
>> > + * Multifd zero page detection implementation.
>> > + *
>> > + * Copyright (c) 2024 Bytedance Inc
>> > + *
>> > + * Authors:
>> > + *  Hao Xiang <hao.xiang@bytedance.com>
>> > + *
>> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> > + * See the COPYING file in the top-level directory.
>> > + */
>> > +
>> > +#include "qemu/osdep.h"
>> > +#include "qemu/cutils.h"
>> > +#include "exec/ramblock.h"
>> > +#include "migration.h"
>> > +#include "multifd.h"
>> > +#include "options.h"
>> > +#include "ram.h"
>> > +
>> > +void multifd_zero_page_check_send(MultiFDSendParams *p)
>> > +{
>> > +    /*
>> > +     * QEMU older than 9.0 don't understand zero page
>> > +     * on multifd channel. This switch is required to
>> > +     * maintain backward compatibility.
>> > +     */
>> > +    bool use_multifd_zero_page =
>> > +        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
>> > +    MultiFDPages_t *pages = p->pages;
>> > +    RAMBlock *rb = pages->block;
>> > +
>> > +    assert(pages->num != 0);
>> > +    assert(pages->normal_num == 0);
>> > +    assert(pages->zero_num == 0);
>>
>> We can drop these before the final version.
>
> Elena has the same concern. I will drop these.
>
>>
>> > +
>> > +    for (int i = 0; i < pages->num; i++) {
>> > +        uint64_t offset = pages->offset[i];
>> > +        if (use_multifd_zero_page &&
>> > +            buffer_is_zero(rb->host + offset, p->page_size)) {
>> > +            pages->zero[pages->zero_num] = offset;
>> > +            pages->zero_num++;
>> > +            ram_release_page(rb->idstr, offset);
>> > +        } else {
>> > +            pages->normal[pages->normal_num] = offset;
>> > +            pages->normal_num++;
>> > +        }
>> > +    }
>>
>> I don't think it's super clean to have three arrays offset, zero and
>> normal, all sized for the full packet size. It might be possible to just
>> carry a bitmap of non-zero pages along with pages->offset and operate on
>> that instead.
>>
>> What do you think?
>>
>> Peter, any ideas? Should we just leave this for another time?
>>
>> > +}
>> > +
>> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
>> > +{
>> > +    for (int i = 0; i < p->zero_num; i++) {
>> > +        void *page = p->host + p->zero[i];
>> > +        if (!buffer_is_zero(page, p->page_size)) {
>> > +            memset(page, 0, p->page_size);
>> > +        }
>> > +    }
>> > +}
>> > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
>> > index 012e3bdea1..cdfe0fa70e 100644
>> > --- a/migration/multifd-zlib.c
>> > +++ b/migration/multifd-zlib.c
>> > @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>> >      int ret;
>> >      uint32_t i;
>> >
>> > +    multifd_zero_page_check_send(p);
>> > +
>> > +    if (!pages->normal_num) {
>> > +        p->next_packet_size = 0;
>> > +        goto out;
>> > +    }
>> > +
>> >      multifd_send_prepare_header(p);
>> >
>> > -    for (i = 0; i < pages->num; i++) {
>> > +    for (i = 0; i < pages->normal_num; i++) {
>> >          uint32_t available = z->zbuff_len - out_size;
>> >          int flush = Z_NO_FLUSH;
>> >
>> > -        if (i == pages->num - 1) {
>> > +        if (i == pages->normal_num - 1) {
>> >              flush = Z_SYNC_FLUSH;
>> >          }
>> >
>> > @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>> >           * with compression. zlib does not guarantee that this is safe,
>> >           * therefore copy the page before calling deflate().
>> >           */
>> > -        memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
>> > +        memcpy(z->buf, p->pages->block->host + pages->normal[i], p->page_size);
>> >          zs->avail_in = p->page_size;
>> >          zs->next_in = z->buf;
>> >
>> > @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
>> >      p->iov[p->iovs_num].iov_len = out_size;
>> >      p->iovs_num++;
>> >      p->next_packet_size = out_size;
>> > -    p->flags |= MULTIFD_FLAG_ZLIB;
>> >
>> > +out:
>> > +    p->flags |= MULTIFD_FLAG_ZLIB;
>> >      multifd_send_fill_packet(p);
>> > -
>> >      return 0;
>> >  }
>> >
>> > @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
>> >                     p->id, flags, MULTIFD_FLAG_ZLIB);
>> >          return -1;
>> >      }
>> > +
>> > +    multifd_zero_page_check_recv(p);
>> > +
>> > +    if (!p->normal_num) {
>> > +        assert(in_size == 0);
>> > +        return 0;
>> > +    }
>> > +
>> >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>> >
>> >      if (ret != 0) {
>> > @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
>> >                     p->id, out_size, expected_size);
>> >          return -1;
>> >      }
>> > +
>> >      return 0;
>> >  }
>> >
>> > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
>> > index dc8fe43e94..27a1eba075 100644
>> > --- a/migration/multifd-zstd.c
>> > +++ b/migration/multifd-zstd.c
>> > @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
>> >      int ret;
>> >      uint32_t i;
>> >
>> > +    multifd_zero_page_check_send(p);
>> > +
>> > +    if (!pages->normal_num) {
>> > +        p->next_packet_size = 0;
>> > +        goto out;
>> > +    }
>> > +
>> >      multifd_send_prepare_header(p);
>> >
>> >      z->out.dst = z->zbuff;
>> >      z->out.size = z->zbuff_len;
>> >      z->out.pos = 0;
>> >
>> > -    for (i = 0; i < pages->num; i++) {
>> > +    for (i = 0; i < pages->normal_num; i++) {
>> >          ZSTD_EndDirective flush = ZSTD_e_continue;
>> >
>> > -        if (i == pages->num - 1) {
>> > +        if (i == pages->normal_num - 1) {
>> >              flush = ZSTD_e_flush;
>> >          }
>> > -        z->in.src = p->pages->block->host + pages->offset[i];
>> > +        z->in.src = p->pages->block->host + pages->normal[i];
>> >          z->in.size = p->page_size;
>> >          z->in.pos = 0;
>> >
>> > @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
>> >      p->iov[p->iovs_num].iov_len = z->out.pos;
>> >      p->iovs_num++;
>> >      p->next_packet_size = z->out.pos;
>> > -    p->flags |= MULTIFD_FLAG_ZSTD;
>> >
>> > +out:
>> > +    p->flags |= MULTIFD_FLAG_ZSTD;
>> >      multifd_send_fill_packet(p);
>> > -
>> >      return 0;
>> >  }
>> >
>> > @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
>> >                     p->id, flags, MULTIFD_FLAG_ZSTD);
>> >          return -1;
>> >      }
>> > +
>> > +    multifd_zero_page_check_recv(p);
>> > +
>> > +    if (!p->normal_num) {
>> > +        assert(in_size == 0);
>> > +        return 0;
>> > +    }
>> > +
>> >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
>> >
>> >      if (ret != 0) {
>> > diff --git a/migration/multifd.c b/migration/multifd.c
>> > index a33dba40d9..fbb40ea10b 100644
>> > --- a/migration/multifd.c
>> > +++ b/migration/multifd.c
>> > @@ -11,6 +11,7 @@
>> >   */
>> >
>> >  #include "qemu/osdep.h"
>> > +#include "qemu/cutils.h"
>> >  #include "qemu/rcu.h"
>> >  #include "exec/target_page.h"
>> >  #include "sysemu/sysemu.h"
>> > @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
>> >      MultiFDPages_t *pages = p->pages;
>> >      int ret;
>> >
>> > +    multifd_zero_page_check_send(p);
>> > +
>> >      if (!use_zero_copy_send) {
>> >          /*
>> >           * Only !zerocopy needs the header in IOV; zerocopy will
>> > @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
>> >          multifd_send_prepare_header(p);
>> >      }
>> >
>> > -    for (int i = 0; i < pages->num; i++) {
>> > -        p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
>> > +    for (int i = 0; i < pages->normal_num; i++) {
>> > +        p->iov[p->iovs_num].iov_base = pages->block->host + pages->normal[i];
>> >          p->iov[p->iovs_num].iov_len = p->page_size;
>> >          p->iovs_num++;
>> >      }
>> >
>> > -    p->next_packet_size = pages->num * p->page_size;
>> > +    p->next_packet_size = pages->normal_num * p->page_size;
>> >      p->flags |= MULTIFD_FLAG_NOCOMP;
>> >
>> >      multifd_send_fill_packet(p);
>> > @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
>> >                     p->id, flags, MULTIFD_FLAG_NOCOMP);
>> >          return -1;
>> >      }
>> > +
>> > +    multifd_zero_page_check_recv(p);
>> > +
>> > +    if (!p->normal_num) {
>> > +        return 0;
>> > +    }
>> > +
>> >      for (int i = 0; i < p->normal_num; i++) {
>> >          p->iov[i].iov_base = p->host + p->normal[i];
>> >          p->iov[i].iov_len = p->page_size;
>> > @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>> >
>> >      packet->flags = cpu_to_be32(p->flags);
>> >      packet->pages_alloc = cpu_to_be32(p->pages->allocated);
>> > -    packet->normal_pages = cpu_to_be32(pages->num);
>> > +    packet->normal_pages = cpu_to_be32(pages->normal_num);
>> >      packet->zero_pages = cpu_to_be32(pages->zero_num);
>> >      packet->next_packet_size = cpu_to_be32(p->next_packet_size);
>> >
>> > @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
>> >          strncpy(packet->ramblock, pages->block->idstr, 256);
>> >      }
>> >
>> > -    for (i = 0; i < pages->num; i++) {
>> > +    for (i = 0; i < pages->normal_num; i++) {
>> >          /* there are architectures where ram_addr_t is 32 bit */
>> > -        uint64_t temp = pages->offset[i];
>> > +        uint64_t temp = pages->normal[i];
>> >
>> >          packet->offset[i] = cpu_to_be64(temp);
>> >      }
>> >
>> > +    for (i = 0; i < pages->zero_num; i++) {
>> > +        /* there are architectures where ram_addr_t is 32 bit */
>> > +        uint64_t temp = pages->zero[i];
>> > +
>> > +        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
>> > +    }
>> > +
>> >      p->packets_sent++;
>> > -    p->total_normal_pages += pages->num;
>> > +    p->total_normal_pages += pages->normal_num;
>> >      p->total_zero_pages += pages->zero_num;
>> >
>> > -    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
>> > +    trace_multifd_send(p->id, packet_num, pages->normal_num, pages->zero_num,
>> >                         p->flags, p->next_packet_size);
>> >  }
>> >
>> > @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
>> >          p->normal[i] = offset;
>> >      }
>> >
>> > +    for (i = 0; i < p->zero_num; i++) {
>> > +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
>> > +
>> > +        if (offset > (p->block->used_length - p->page_size)) {
>> > +            error_setg(errp, "multifd: offset too long %" PRIu64
>> > +                       " (max " RAM_ADDR_FMT ")",
>> > +                       offset, p->block->used_length);
>> > +            return -1;
>> > +        }
>> > +        p->zero[i] = offset;
>> > +    }
>> > +
>> >      return 0;
>> >  }
>> >
>> > @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
>> >
>> >              stat64_add(&mig_stats.multifd_bytes,
>> >                         p->next_packet_size + p->packet_len);
>> > -            stat64_add(&mig_stats.normal_pages, pages->num);
>> > +            stat64_add(&mig_stats.normal_pages, pages->normal_num);
>> >              stat64_add(&mig_stats.zero_pages, pages->zero_num);
>> >
>> >              multifd_pages_reset(p->pages);
>> > @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
>> >          p->flags &= ~MULTIFD_FLAG_SYNC;
>> >          qemu_mutex_unlock(&p->mutex);
>> >
>> > -        if (p->normal_num) {
>> > +        if (p->normal_num + p->zero_num) {
>> > +            assert(!(flags & MULTIFD_FLAG_SYNC));
>>
>> This breaks 8.2 -> 9.0 migration. QEMU 8.2 is still sending the SYNC
>> along with the data packet.
>
> I keep missing the compatibility thing. Will remove this.
>

If you send this through CI it will catch these issues on x86 at least.

You can also keep an 8.2 build and run the tests in your local machine
like this:

$ git checkout <devel>
$ mkdir build
$ cd build
$ configure, make

$ git checkout v8.2.0
$ mkdir ../build-8.2.0
$ cd ../build-8.2.0
$ configure, make

# this has to use the tests from the *previous* version
# 8.2 -> devel
$ QTEST_QEMU_BINARY_SRC=./qemu-system-x86_64  \
  QTEST_QEMU_BINARY=../build/qemu-system-x86_64 \
  ./tests/qtest/migration-test

# devel -> 8.2
$ QTEST_QEMU_BINARY_DST=./qemu-system-x86_64  \
  QTEST_QEMU_BINARY=../build/qemu-system-x86_64 \
  ./tests/qtest/migration-test

>>
>> >              ret = multifd_recv_state->ops->recv_pages(p, &local_err);
>> >              if (ret != 0) {
>> >                  break;
>> > diff --git a/migration/multifd.h b/migration/multifd.h
>> > index 9822ff298a..125f0bbe60 100644
>> > --- a/migration/multifd.h
>> > +++ b/migration/multifd.h
>> > @@ -53,6 +53,11 @@ typedef struct {
>> >      uint32_t unused32[1];    /* Reserved for future use */
>> >      uint64_t unused64[3];    /* Reserved for future use */
>> >      char ramblock[256];
>> > +    /*
>> > +     * This array contains the pointers to:
>> > +     *  - normal pages (initial normal_pages entries)
>> > +     *  - zero pages (following zero_pages entries)
>> > +     */
>> >      uint64_t offset[];
>> >  } __attribute__((packed)) MultiFDPacket_t;
>> >
>> > @@ -224,6 +229,8 @@ typedef struct {
>> >
>> >  void multifd_register_ops(int method, MultiFDMethods *ops);
>> >  void multifd_send_fill_packet(MultiFDSendParams *p);
>> > +void multifd_zero_page_check_send(MultiFDSendParams *p);
>> > +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
>> >
>> >  static inline void multifd_send_prepare_header(MultiFDSendParams *p)
>> >  {
>> > diff --git a/qapi/migration.json b/qapi/migration.json
>> > index 99843a8e95..e2450b92d4 100644
>> > --- a/qapi/migration.json
>> > +++ b/qapi/migration.json
>> > @@ -660,9 +660,11 @@
>> >  #
>> >  # @none: Do not perform zero page checking.
>> >  #
>> > +# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)
>> > +#
>> >  ##
>> >  { 'enum': 'ZeroPageDetection',
>> > -  'data': [ 'legacy', 'none' ] }
>> > +  'data': [ 'legacy', 'none', 'multifd' ] }
>> >
>> >  ##
>> >  # @BitmapMigrationBitmapAliasTransform:
Hao Xiang Feb. 24, 2024, 7:06 p.m. UTC | #11
On Thu, Feb 22, 2024 at 8:38 PM Hao Xiang <hao.xiang@bytedance.com> wrote:
>
> On Fri, Feb 16, 2024 at 9:08 PM Richard Henderson
> <richard.henderson@linaro.org> wrote:
> >
> > On 2/16/24 12:39, Hao Xiang wrote:
> > > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > > +{
> > > +    for (int i = 0; i < p->zero_num; i++) {
> > > +        void *page = p->host + p->zero[i];
> > > +        if (!buffer_is_zero(page, p->page_size)) {
> > > +            memset(page, 0, p->page_size);
> > > +        }
> > > +    }
> > > +}
> >
> > You should not check the buffer is zero here, you should just zero it.
>
> I will fix it in the next version.

I tested with zero out all pages but the performance is bad compared
to previously. In my test case, most pages are zero pages. I think
what happened is that the destination host already has the pages being
zero so performing a memcmp is much faster than memset on all zero
pages.

>
> >
> >
> > r~
Hao Xiang Feb. 24, 2024, 10:56 p.m. UTC | #12
On Thu, Feb 22, 2024 at 9:15 PM Hao Xiang <hao.xiang@bytedance.com> wrote:
>
> On Thu, Feb 22, 2024 at 6:21 PM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Wed, Feb 21, 2024 at 06:04:10PM -0300, Fabiano Rosas wrote:
> > > Hao Xiang <hao.xiang@bytedance.com> writes:
> > >
> > > > 1. Implements the zero page detection and handling on the multifd
> > > > threads for non-compression, zlib and zstd compression backends.
> > > > 2. Added a new value 'multifd' in ZeroPageDetection enumeration.
> > > > 3. Add proper asserts to ensure pages->normal are used for normal pages
> > > > in all scenarios.
> > > >
> > > > Signed-off-by: Hao Xiang <hao.xiang@bytedance.com>
> > > > ---
> > > >  migration/meson.build         |  1 +
> > > >  migration/multifd-zero-page.c | 59 +++++++++++++++++++++++++++++++++++
> > > >  migration/multifd-zlib.c      | 26 ++++++++++++---
> > > >  migration/multifd-zstd.c      | 25 ++++++++++++---
> > > >  migration/multifd.c           | 50 +++++++++++++++++++++++------
> > > >  migration/multifd.h           |  7 +++++
> > > >  qapi/migration.json           |  4 ++-
> > > >  7 files changed, 151 insertions(+), 21 deletions(-)
> > > >  create mode 100644 migration/multifd-zero-page.c
> > > >
> > > > diff --git a/migration/meson.build b/migration/meson.build
> > > > index 92b1cc4297..1eeb915ff6 100644
> > > > --- a/migration/meson.build
> > > > +++ b/migration/meson.build
> > > > @@ -22,6 +22,7 @@ system_ss.add(files(
> > > >    'migration.c',
> > > >    'multifd.c',
> > > >    'multifd-zlib.c',
> > > > +  'multifd-zero-page.c',
> > > >    'ram-compress.c',
> > > >    'options.c',
> > > >    'postcopy-ram.c',
> > > > diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
> > > > new file mode 100644
> > > > index 0000000000..f0cd8e2c53
> > > > --- /dev/null
> > > > +++ b/migration/multifd-zero-page.c
> > > > @@ -0,0 +1,59 @@
> > > > +/*
> > > > + * Multifd zero page detection implementation.
> > > > + *
> > > > + * Copyright (c) 2024 Bytedance Inc
> > > > + *
> > > > + * Authors:
> > > > + *  Hao Xiang <hao.xiang@bytedance.com>
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > > + * See the COPYING file in the top-level directory.
> > > > + */
> > > > +
> > > > +#include "qemu/osdep.h"
> > > > +#include "qemu/cutils.h"
> > > > +#include "exec/ramblock.h"
> > > > +#include "migration.h"
> > > > +#include "multifd.h"
> > > > +#include "options.h"
> > > > +#include "ram.h"
> > > > +
> > > > +void multifd_zero_page_check_send(MultiFDSendParams *p)
> > > > +{
> > > > +    /*
> > > > +     * QEMU older than 9.0 don't understand zero page
> > > > +     * on multifd channel. This switch is required to
> > > > +     * maintain backward compatibility.
> > > > +     */
> > > > +    bool use_multifd_zero_page =
> > > > +        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
> > > > +    MultiFDPages_t *pages = p->pages;
> > > > +    RAMBlock *rb = pages->block;
> > > > +
> > > > +    assert(pages->num != 0);
> > > > +    assert(pages->normal_num == 0);
> > > > +    assert(pages->zero_num == 0);
> > >
> > > We can drop these before the final version.
> > >
> > > > +
> > > > +    for (int i = 0; i < pages->num; i++) {
> > > > +        uint64_t offset = pages->offset[i];
> > > > +        if (use_multifd_zero_page &&
> > > > +            buffer_is_zero(rb->host + offset, p->page_size)) {
> > > > +            pages->zero[pages->zero_num] = offset;
> > > > +            pages->zero_num++;
> > > > +            ram_release_page(rb->idstr, offset);
> > > > +        } else {
> > > > +            pages->normal[pages->normal_num] = offset;
> > > > +            pages->normal_num++;
> > > > +        }
> > > > +    }
> > >
> > > I don't think it's super clean to have three arrays offset, zero and
> > > normal, all sized for the full packet size. It might be possible to just
> > > carry a bitmap of non-zero pages along with pages->offset and operate on
> > > that instead.
> > >
> > > What do you think?
> > >
> > > Peter, any ideas? Should we just leave this for another time?
> >
> > Yeah I think a bitmap should save quite a few fields indeed, it'll however
> > make the latter iteration slightly harder by walking both (offset[],
> > bitmap), process the page only if bitmap is set for the offset.
> >
> > IIUC we perhaps don't even need a bitmap?  AFAIU what we only need in
> > Multifdpages_t is one extra field to mark "how many normal pages", aka,
> > normal_num here (zero_num can be calculated from num-normal_num).  Then
> > the zero page detection logic should do two things:
> >
> >   - Sort offset[] array so that it starts with normal pages, followed up by
> >     zero pages
> >
> >   - Setup normal_num to be the number of normal pages
> >
> > Then we reduce 2 new arrays (normal[], zero[]) + 2 new fields (normal_num,
> > zero_num) -> 1 new field (normal_num).  It'll also be trivial to fill the
> > packet header later because offset[] is exactly that.
> >
> > Side note - I still think it's confusing to read this patch and previous
> > patch separately.  Obviously previous patch introduced these new fields
> > without justifying their values yet.  IMHO it'll be easier to review if you
> > merge the two patches.
>
> Fabiano, thanks for catching this. I totally missed the backward
> compatibility thing.
> Peter, I will code the sorting and merge this patch with the previous one.
>
It turns out that we still need to add a "zero_pages" field in
MultiFDPacket_t because the existing field "pages_alloc" is not the
total number of pages in "offset". So source can set "zero_pages" from
pages->num - pages->num_normal but "zero_pages" needs to be set in the
packet.
> >
> > >
> > > > +}
> > > > +
> > > > +void multifd_zero_page_check_recv(MultiFDRecvParams *p)
> > > > +{
> > > > +    for (int i = 0; i < p->zero_num; i++) {
> > > > +        void *page = p->host + p->zero[i];
> > > > +        if (!buffer_is_zero(page, p->page_size)) {
> > > > +            memset(page, 0, p->page_size);
> > > > +        }
> > > > +    }
> > > > +}
> > > > diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
> > > > index 012e3bdea1..cdfe0fa70e 100644
> > > > --- a/migration/multifd-zlib.c
> > > > +++ b/migration/multifd-zlib.c
> > > > @@ -123,13 +123,20 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> > > >      int ret;
> > > >      uint32_t i;
> > > >
> > > > +    multifd_zero_page_check_send(p);
> > > > +
> > > > +    if (!pages->normal_num) {
> > > > +        p->next_packet_size = 0;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > >      multifd_send_prepare_header(p);
> > > >
> > > > -    for (i = 0; i < pages->num; i++) {
> > > > +    for (i = 0; i < pages->normal_num; i++) {
> > > >          uint32_t available = z->zbuff_len - out_size;
> > > >          int flush = Z_NO_FLUSH;
> > > >
> > > > -        if (i == pages->num - 1) {
> > > > +        if (i == pages->normal_num - 1) {
> > > >              flush = Z_SYNC_FLUSH;
> > > >          }
> > > >
> > > > @@ -138,7 +145,7 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> > > >           * with compression. zlib does not guarantee that this is safe,
> > > >           * therefore copy the page before calling deflate().
> > > >           */
> > > > -        memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
> > > > +        memcpy(z->buf, p->pages->block->host + pages->normal[i], p->page_size);
> > > >          zs->avail_in = p->page_size;
> > > >          zs->next_in = z->buf;
> > > >
> > > > @@ -172,10 +179,10 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
> > > >      p->iov[p->iovs_num].iov_len = out_size;
> > > >      p->iovs_num++;
> > > >      p->next_packet_size = out_size;
> > > > -    p->flags |= MULTIFD_FLAG_ZLIB;
> > > >
> > > > +out:
> > > > +    p->flags |= MULTIFD_FLAG_ZLIB;
> > > >      multifd_send_fill_packet(p);
> > > > -
> > > >      return 0;
> > > >  }
> > > >
> > > > @@ -261,6 +268,14 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
> > > >                     p->id, flags, MULTIFD_FLAG_ZLIB);
> > > >          return -1;
> > > >      }
> > > > +
> > > > +    multifd_zero_page_check_recv(p);
> > > > +
> > > > +    if (!p->normal_num) {
> > > > +        assert(in_size == 0);
> > > > +        return 0;
> > > > +    }
> > > > +
> > > >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
> > > >
> > > >      if (ret != 0) {
> > > > @@ -310,6 +325,7 @@ static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
> > > >                     p->id, out_size, expected_size);
> > > >          return -1;
> > > >      }
> > > > +
> > > >      return 0;
> > > >  }
> > > >
> > > > diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
> > > > index dc8fe43e94..27a1eba075 100644
> > > > --- a/migration/multifd-zstd.c
> > > > +++ b/migration/multifd-zstd.c
> > > > @@ -118,19 +118,26 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
> > > >      int ret;
> > > >      uint32_t i;
> > > >
> > > > +    multifd_zero_page_check_send(p);
> > > > +
> > > > +    if (!pages->normal_num) {
> > > > +        p->next_packet_size = 0;
> > > > +        goto out;
> > > > +    }
> > > > +
> > > >      multifd_send_prepare_header(p);
> >
> > If this forms a pattern we can introduce multifd_send_prepare_common():
>
> I will add that in the next version.
>
> >
> > bool multifd_send_prepare_common()
> > {
> >     multifd_zero_page_check_send();
> >     if (...) {
> >
> >     }
> >     multifd_send_prepare_header();
> > }
> >
> > > >
> > > >      z->out.dst = z->zbuff;
> > > >      z->out.size = z->zbuff_len;
> > > >      z->out.pos = 0;
> > > >
> > > > -    for (i = 0; i < pages->num; i++) {
> > > > +    for (i = 0; i < pages->normal_num; i++) {
> > > >          ZSTD_EndDirective flush = ZSTD_e_continue;
> > > >
> > > > -        if (i == pages->num - 1) {
> > > > +        if (i == pages->normal_num - 1) {
> > > >              flush = ZSTD_e_flush;
> > > >          }
> > > > -        z->in.src = p->pages->block->host + pages->offset[i];
> > > > +        z->in.src = p->pages->block->host + pages->normal[i];
> > > >          z->in.size = p->page_size;
> > > >          z->in.pos = 0;
> > > >
> > > > @@ -161,10 +168,10 @@ static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
> > > >      p->iov[p->iovs_num].iov_len = z->out.pos;
> > > >      p->iovs_num++;
> > > >      p->next_packet_size = z->out.pos;
> > > > -    p->flags |= MULTIFD_FLAG_ZSTD;
> > > >
> > > > +out:
> > > > +    p->flags |= MULTIFD_FLAG_ZSTD;
> > > >      multifd_send_fill_packet(p);
> > > > -
> > > >      return 0;
> > > >  }
> > > >
> > > > @@ -257,6 +264,14 @@ static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
> > > >                     p->id, flags, MULTIFD_FLAG_ZSTD);
> > > >          return -1;
> > > >      }
> > > > +
> > > > +    multifd_zero_page_check_recv(p);
> > > > +
> > > > +    if (!p->normal_num) {
> > > > +        assert(in_size == 0);
> > > > +        return 0;
> > > > +    }
> > > > +
> > > >      ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
> > > >
> > > >      if (ret != 0) {
> > > > diff --git a/migration/multifd.c b/migration/multifd.c
> > > > index a33dba40d9..fbb40ea10b 100644
> > > > --- a/migration/multifd.c
> > > > +++ b/migration/multifd.c
> > > > @@ -11,6 +11,7 @@
> > > >   */
> > > >
> > > >  #include "qemu/osdep.h"
> > > > +#include "qemu/cutils.h"
> > > >  #include "qemu/rcu.h"
> > > >  #include "exec/target_page.h"
> > > >  #include "sysemu/sysemu.h"
> > > > @@ -126,6 +127,8 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> > > >      MultiFDPages_t *pages = p->pages;
> > > >      int ret;
> > > >
> > > > +    multifd_zero_page_check_send(p);
> > > > +
> > > >      if (!use_zero_copy_send) {
> > > >          /*
> > > >           * Only !zerocopy needs the header in IOV; zerocopy will
> > > > @@ -134,13 +137,13 @@ static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
> > > >          multifd_send_prepare_header(p);
> > > >      }
> > > >
> > > > -    for (int i = 0; i < pages->num; i++) {
> > > > -        p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
> > > > +    for (int i = 0; i < pages->normal_num; i++) {
> > > > +        p->iov[p->iovs_num].iov_base = pages->block->host + pages->normal[i];
> > > >          p->iov[p->iovs_num].iov_len = p->page_size;
> > > >          p->iovs_num++;
> > > >      }
> > > >
> > > > -    p->next_packet_size = pages->num * p->page_size;
> > > > +    p->next_packet_size = pages->normal_num * p->page_size;
> > > >      p->flags |= MULTIFD_FLAG_NOCOMP;
> > > >
> > > >      multifd_send_fill_packet(p);
> > > > @@ -202,6 +205,13 @@ static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
> > > >                     p->id, flags, MULTIFD_FLAG_NOCOMP);
> > > >          return -1;
> > > >      }
> > > > +
> > > > +    multifd_zero_page_check_recv(p);
> > > > +
> > > > +    if (!p->normal_num) {
> > > > +        return 0;
> > > > +    }
> > > > +
> > > >      for (int i = 0; i < p->normal_num; i++) {
> > > >          p->iov[i].iov_base = p->host + p->normal[i];
> > > >          p->iov[i].iov_len = p->page_size;
> > > > @@ -339,7 +349,7 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
> > > >
> > > >      packet->flags = cpu_to_be32(p->flags);
> > > >      packet->pages_alloc = cpu_to_be32(p->pages->allocated);
> > > > -    packet->normal_pages = cpu_to_be32(pages->num);
> > > > +    packet->normal_pages = cpu_to_be32(pages->normal_num);
> > > >      packet->zero_pages = cpu_to_be32(pages->zero_num);
> > > >      packet->next_packet_size = cpu_to_be32(p->next_packet_size);
> > > >
> > > > @@ -350,18 +360,25 @@ void multifd_send_fill_packet(MultiFDSendParams *p)
> > > >          strncpy(packet->ramblock, pages->block->idstr, 256);
> > > >      }
> > > >
> > > > -    for (i = 0; i < pages->num; i++) {
> > > > +    for (i = 0; i < pages->normal_num; i++) {
> > > >          /* there are architectures where ram_addr_t is 32 bit */
> > > > -        uint64_t temp = pages->offset[i];
> > > > +        uint64_t temp = pages->normal[i];
> > > >
> > > >          packet->offset[i] = cpu_to_be64(temp);
> > > >      }
> > > >
> > > > +    for (i = 0; i < pages->zero_num; i++) {
> > > > +        /* there are architectures where ram_addr_t is 32 bit */
> > > > +        uint64_t temp = pages->zero[i];
> > > > +
> > > > +        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
> > > > +    }
> > > > +
> > > >      p->packets_sent++;
> > > > -    p->total_normal_pages += pages->num;
> > > > +    p->total_normal_pages += pages->normal_num;
> > > >      p->total_zero_pages += pages->zero_num;
> > > >
> > > > -    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
> > > > +    trace_multifd_send(p->id, packet_num, pages->normal_num, pages->zero_num,
> > > >                         p->flags, p->next_packet_size);
> > > >  }
> > > >
> > > > @@ -451,6 +468,18 @@ static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
> > > >          p->normal[i] = offset;
> > > >      }
> > > >
> > > > +    for (i = 0; i < p->zero_num; i++) {
> > > > +        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
> > > > +
> > > > +        if (offset > (p->block->used_length - p->page_size)) {
> > > > +            error_setg(errp, "multifd: offset too long %" PRIu64
> > > > +                       " (max " RAM_ADDR_FMT ")",
> > > > +                       offset, p->block->used_length);
> > > > +            return -1;
> > > > +        }
> > > > +        p->zero[i] = offset;
> > > > +    }
> > > > +
> > > >      return 0;
> > > >  }
> > > >
> > > > @@ -842,7 +871,7 @@ static void *multifd_send_thread(void *opaque)
> > > >
> > > >              stat64_add(&mig_stats.multifd_bytes,
> > > >                         p->next_packet_size + p->packet_len);
> > > > -            stat64_add(&mig_stats.normal_pages, pages->num);
> > > > +            stat64_add(&mig_stats.normal_pages, pages->normal_num);
> > > >              stat64_add(&mig_stats.zero_pages, pages->zero_num);
> > > >
> > > >              multifd_pages_reset(p->pages);
> > > > @@ -1256,7 +1285,8 @@ static void *multifd_recv_thread(void *opaque)
> > > >          p->flags &= ~MULTIFD_FLAG_SYNC;
> > > >          qemu_mutex_unlock(&p->mutex);
> > > >
> > > > -        if (p->normal_num) {
> > > > +        if (p->normal_num + p->zero_num) {
> > > > +            assert(!(flags & MULTIFD_FLAG_SYNC));
> > >
> > > This breaks 8.2 -> 9.0 migration. QEMU 8.2 is still sending the SYNC
> > > along with the data packet.
> > >
> > > >              ret = multifd_recv_state->ops->recv_pages(p, &local_err);
> > > >              if (ret != 0) {
> > > >                  break;
> > > > diff --git a/migration/multifd.h b/migration/multifd.h
> > > > index 9822ff298a..125f0bbe60 100644
> > > > --- a/migration/multifd.h
> > > > +++ b/migration/multifd.h
> > > > @@ -53,6 +53,11 @@ typedef struct {
> > > >      uint32_t unused32[1];    /* Reserved for future use */
> > > >      uint64_t unused64[3];    /* Reserved for future use */
> > > >      char ramblock[256];
> > > > +    /*
> > > > +     * This array contains the pointers to:
> > > > +     *  - normal pages (initial normal_pages entries)
> > > > +     *  - zero pages (following zero_pages entries)
> > > > +     */
> > > >      uint64_t offset[];
> > > >  } __attribute__((packed)) MultiFDPacket_t;
> > > >
> > > > @@ -224,6 +229,8 @@ typedef struct {
> > > >
> > > >  void multifd_register_ops(int method, MultiFDMethods *ops);
> > > >  void multifd_send_fill_packet(MultiFDSendParams *p);
> > > > +void multifd_zero_page_check_send(MultiFDSendParams *p);
> > > > +void multifd_zero_page_check_recv(MultiFDRecvParams *p);
> > > >
> > > >  static inline void multifd_send_prepare_header(MultiFDSendParams *p)
> > > >  {
> > > > diff --git a/qapi/migration.json b/qapi/migration.json
> > > > index 99843a8e95..e2450b92d4 100644
> > > > --- a/qapi/migration.json
> > > > +++ b/qapi/migration.json
> > > > @@ -660,9 +660,11 @@
> > > >  #
> > > >  # @none: Do not perform zero page checking.
> > > >  #
> > > > +# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)
> > > > +#
> > > >  ##
> > > >  { 'enum': 'ZeroPageDetection',
> > > > -  'data': [ 'legacy', 'none' ] }
> > > > +  'data': [ 'legacy', 'none', 'multifd' ] }
> > > >
> > > >  ##
> > > >  # @BitmapMigrationBitmapAliasTransform:
> > >
> >
> > --
> > Peter Xu
> >
Peter Xu Feb. 26, 2024, 1:30 a.m. UTC | #13
On Sat, Feb 24, 2024 at 02:56:15PM -0800, Hao Xiang wrote:
> > > > I don't think it's super clean to have three arrays offset, zero and
> > > > normal, all sized for the full packet size. It might be possible to just
> > > > carry a bitmap of non-zero pages along with pages->offset and operate on
> > > > that instead.
> > > >
> > > > What do you think?
> > > >
> > > > Peter, any ideas? Should we just leave this for another time?
> > >
> > > Yeah I think a bitmap should save quite a few fields indeed, it'll however
> > > make the latter iteration slightly harder by walking both (offset[],
> > > bitmap), process the page only if bitmap is set for the offset.
> > >
> > > IIUC we perhaps don't even need a bitmap?  AFAIU what we only need in
> > > Multifdpages_t is one extra field to mark "how many normal pages", aka,
> > > normal_num here (zero_num can be calculated from num-normal_num).  Then
> > > the zero page detection logic should do two things:
> > >
> > >   - Sort offset[] array so that it starts with normal pages, followed up by
> > >     zero pages
> > >
> > >   - Setup normal_num to be the number of normal pages
> > >
> > > Then we reduce 2 new arrays (normal[], zero[]) + 2 new fields (normal_num,
> > > zero_num) -> 1 new field (normal_num).  It'll also be trivial to fill the
> > > packet header later because offset[] is exactly that.
> > >
> > > Side note - I still think it's confusing to read this patch and previous
> > > patch separately.  Obviously previous patch introduced these new fields
> > > without justifying their values yet.  IMHO it'll be easier to review if you
> > > merge the two patches.
> >
> > Fabiano, thanks for catching this. I totally missed the backward
> > compatibility thing.
> > Peter, I will code the sorting and merge this patch with the previous one.
> >
> It turns out that we still need to add a "zero_pages" field in
> MultiFDPacket_t because the existing field "pages_alloc" is not the
> total number of pages in "offset". So source can set "zero_pages" from
> pages->num - pages->num_normal but "zero_pages" needs to be set in the
> packet.

Yes, one more field should be needed in MultiFDPacket_t.  Noet that what I
said above was about Multifdpages_t, not MultiFDPacket_t (which is the wire
protocol instead).  To support zero page offloading we should need one more
field for each.

IMHO MultiFDPacket_t.pages_alloc is redundant and actually not useful..
It's just that it existed in the wire protocol already so maybe we'd still
better keep it there..
diff mbox series

Patch

diff --git a/migration/meson.build b/migration/meson.build
index 92b1cc4297..1eeb915ff6 100644
--- a/migration/meson.build
+++ b/migration/meson.build
@@ -22,6 +22,7 @@  system_ss.add(files(
   'migration.c',
   'multifd.c',
   'multifd-zlib.c',
+  'multifd-zero-page.c',
   'ram-compress.c',
   'options.c',
   'postcopy-ram.c',
diff --git a/migration/multifd-zero-page.c b/migration/multifd-zero-page.c
new file mode 100644
index 0000000000..f0cd8e2c53
--- /dev/null
+++ b/migration/multifd-zero-page.c
@@ -0,0 +1,59 @@ 
+/*
+ * Multifd zero page detection implementation.
+ *
+ * Copyright (c) 2024 Bytedance Inc
+ *
+ * Authors:
+ *  Hao Xiang <hao.xiang@bytedance.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "exec/ramblock.h"
+#include "migration.h"
+#include "multifd.h"
+#include "options.h"
+#include "ram.h"
+
+void multifd_zero_page_check_send(MultiFDSendParams *p)
+{
+    /*
+     * QEMU older than 9.0 don't understand zero page
+     * on multifd channel. This switch is required to
+     * maintain backward compatibility.
+     */
+    bool use_multifd_zero_page =
+        (migrate_zero_page_detection() == ZERO_PAGE_DETECTION_MULTIFD);
+    MultiFDPages_t *pages = p->pages;
+    RAMBlock *rb = pages->block;
+
+    assert(pages->num != 0);
+    assert(pages->normal_num == 0);
+    assert(pages->zero_num == 0);
+
+    for (int i = 0; i < pages->num; i++) {
+        uint64_t offset = pages->offset[i];
+        if (use_multifd_zero_page &&
+            buffer_is_zero(rb->host + offset, p->page_size)) {
+            pages->zero[pages->zero_num] = offset;
+            pages->zero_num++;
+            ram_release_page(rb->idstr, offset);
+        } else {
+            pages->normal[pages->normal_num] = offset;
+            pages->normal_num++;
+        }
+    }
+}
+
+void multifd_zero_page_check_recv(MultiFDRecvParams *p)
+{
+    for (int i = 0; i < p->zero_num; i++) {
+        void *page = p->host + p->zero[i];
+        if (!buffer_is_zero(page, p->page_size)) {
+            memset(page, 0, p->page_size);
+        }
+    }
+}
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 012e3bdea1..cdfe0fa70e 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -123,13 +123,20 @@  static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
     int ret;
     uint32_t i;
 
+    multifd_zero_page_check_send(p);
+
+    if (!pages->normal_num) {
+        p->next_packet_size = 0;
+        goto out;
+    }
+
     multifd_send_prepare_header(p);
 
-    for (i = 0; i < pages->num; i++) {
+    for (i = 0; i < pages->normal_num; i++) {
         uint32_t available = z->zbuff_len - out_size;
         int flush = Z_NO_FLUSH;
 
-        if (i == pages->num - 1) {
+        if (i == pages->normal_num - 1) {
             flush = Z_SYNC_FLUSH;
         }
 
@@ -138,7 +145,7 @@  static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
          * with compression. zlib does not guarantee that this is safe,
          * therefore copy the page before calling deflate().
          */
-        memcpy(z->buf, p->pages->block->host + pages->offset[i], p->page_size);
+        memcpy(z->buf, p->pages->block->host + pages->normal[i], p->page_size);
         zs->avail_in = p->page_size;
         zs->next_in = z->buf;
 
@@ -172,10 +179,10 @@  static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
     p->iov[p->iovs_num].iov_len = out_size;
     p->iovs_num++;
     p->next_packet_size = out_size;
-    p->flags |= MULTIFD_FLAG_ZLIB;
 
+out:
+    p->flags |= MULTIFD_FLAG_ZLIB;
     multifd_send_fill_packet(p);
-
     return 0;
 }
 
@@ -261,6 +268,14 @@  static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
                    p->id, flags, MULTIFD_FLAG_ZLIB);
         return -1;
     }
+
+    multifd_zero_page_check_recv(p);
+
+    if (!p->normal_num) {
+        assert(in_size == 0);
+        return 0;
+    }
+
     ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
 
     if (ret != 0) {
@@ -310,6 +325,7 @@  static int zlib_recv_pages(MultiFDRecvParams *p, Error **errp)
                    p->id, out_size, expected_size);
         return -1;
     }
+
     return 0;
 }
 
diff --git a/migration/multifd-zstd.c b/migration/multifd-zstd.c
index dc8fe43e94..27a1eba075 100644
--- a/migration/multifd-zstd.c
+++ b/migration/multifd-zstd.c
@@ -118,19 +118,26 @@  static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
     int ret;
     uint32_t i;
 
+    multifd_zero_page_check_send(p);
+
+    if (!pages->normal_num) {
+        p->next_packet_size = 0;
+        goto out;
+    }
+
     multifd_send_prepare_header(p);
 
     z->out.dst = z->zbuff;
     z->out.size = z->zbuff_len;
     z->out.pos = 0;
 
-    for (i = 0; i < pages->num; i++) {
+    for (i = 0; i < pages->normal_num; i++) {
         ZSTD_EndDirective flush = ZSTD_e_continue;
 
-        if (i == pages->num - 1) {
+        if (i == pages->normal_num - 1) {
             flush = ZSTD_e_flush;
         }
-        z->in.src = p->pages->block->host + pages->offset[i];
+        z->in.src = p->pages->block->host + pages->normal[i];
         z->in.size = p->page_size;
         z->in.pos = 0;
 
@@ -161,10 +168,10 @@  static int zstd_send_prepare(MultiFDSendParams *p, Error **errp)
     p->iov[p->iovs_num].iov_len = z->out.pos;
     p->iovs_num++;
     p->next_packet_size = z->out.pos;
-    p->flags |= MULTIFD_FLAG_ZSTD;
 
+out:
+    p->flags |= MULTIFD_FLAG_ZSTD;
     multifd_send_fill_packet(p);
-
     return 0;
 }
 
@@ -257,6 +264,14 @@  static int zstd_recv_pages(MultiFDRecvParams *p, Error **errp)
                    p->id, flags, MULTIFD_FLAG_ZSTD);
         return -1;
     }
+
+    multifd_zero_page_check_recv(p);
+
+    if (!p->normal_num) {
+        assert(in_size == 0);
+        return 0;
+    }
+
     ret = qio_channel_read_all(p->c, (void *)z->zbuff, in_size, errp);
 
     if (ret != 0) {
diff --git a/migration/multifd.c b/migration/multifd.c
index a33dba40d9..fbb40ea10b 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -11,6 +11,7 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "qemu/rcu.h"
 #include "exec/target_page.h"
 #include "sysemu/sysemu.h"
@@ -126,6 +127,8 @@  static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
     MultiFDPages_t *pages = p->pages;
     int ret;
 
+    multifd_zero_page_check_send(p);
+
     if (!use_zero_copy_send) {
         /*
          * Only !zerocopy needs the header in IOV; zerocopy will
@@ -134,13 +137,13 @@  static int nocomp_send_prepare(MultiFDSendParams *p, Error **errp)
         multifd_send_prepare_header(p);
     }
 
-    for (int i = 0; i < pages->num; i++) {
-        p->iov[p->iovs_num].iov_base = pages->block->host + pages->offset[i];
+    for (int i = 0; i < pages->normal_num; i++) {
+        p->iov[p->iovs_num].iov_base = pages->block->host + pages->normal[i];
         p->iov[p->iovs_num].iov_len = p->page_size;
         p->iovs_num++;
     }
 
-    p->next_packet_size = pages->num * p->page_size;
+    p->next_packet_size = pages->normal_num * p->page_size;
     p->flags |= MULTIFD_FLAG_NOCOMP;
 
     multifd_send_fill_packet(p);
@@ -202,6 +205,13 @@  static int nocomp_recv_pages(MultiFDRecvParams *p, Error **errp)
                    p->id, flags, MULTIFD_FLAG_NOCOMP);
         return -1;
     }
+
+    multifd_zero_page_check_recv(p);
+
+    if (!p->normal_num) {
+        return 0;
+    }
+
     for (int i = 0; i < p->normal_num; i++) {
         p->iov[i].iov_base = p->host + p->normal[i];
         p->iov[i].iov_len = p->page_size;
@@ -339,7 +349,7 @@  void multifd_send_fill_packet(MultiFDSendParams *p)
 
     packet->flags = cpu_to_be32(p->flags);
     packet->pages_alloc = cpu_to_be32(p->pages->allocated);
-    packet->normal_pages = cpu_to_be32(pages->num);
+    packet->normal_pages = cpu_to_be32(pages->normal_num);
     packet->zero_pages = cpu_to_be32(pages->zero_num);
     packet->next_packet_size = cpu_to_be32(p->next_packet_size);
 
@@ -350,18 +360,25 @@  void multifd_send_fill_packet(MultiFDSendParams *p)
         strncpy(packet->ramblock, pages->block->idstr, 256);
     }
 
-    for (i = 0; i < pages->num; i++) {
+    for (i = 0; i < pages->normal_num; i++) {
         /* there are architectures where ram_addr_t is 32 bit */
-        uint64_t temp = pages->offset[i];
+        uint64_t temp = pages->normal[i];
 
         packet->offset[i] = cpu_to_be64(temp);
     }
 
+    for (i = 0; i < pages->zero_num; i++) {
+        /* there are architectures where ram_addr_t is 32 bit */
+        uint64_t temp = pages->zero[i];
+
+        packet->offset[pages->normal_num + i] = cpu_to_be64(temp);
+    }
+
     p->packets_sent++;
-    p->total_normal_pages += pages->num;
+    p->total_normal_pages += pages->normal_num;
     p->total_zero_pages += pages->zero_num;
 
-    trace_multifd_send(p->id, packet_num, pages->num, pages->zero_num,
+    trace_multifd_send(p->id, packet_num, pages->normal_num, pages->zero_num,
                        p->flags, p->next_packet_size);
 }
 
@@ -451,6 +468,18 @@  static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
         p->normal[i] = offset;
     }
 
+    for (i = 0; i < p->zero_num; i++) {
+        uint64_t offset = be64_to_cpu(packet->offset[p->normal_num + i]);
+
+        if (offset > (p->block->used_length - p->page_size)) {
+            error_setg(errp, "multifd: offset too long %" PRIu64
+                       " (max " RAM_ADDR_FMT ")",
+                       offset, p->block->used_length);
+            return -1;
+        }
+        p->zero[i] = offset;
+    }
+
     return 0;
 }
 
@@ -842,7 +871,7 @@  static void *multifd_send_thread(void *opaque)
 
             stat64_add(&mig_stats.multifd_bytes,
                        p->next_packet_size + p->packet_len);
-            stat64_add(&mig_stats.normal_pages, pages->num);
+            stat64_add(&mig_stats.normal_pages, pages->normal_num);
             stat64_add(&mig_stats.zero_pages, pages->zero_num);
 
             multifd_pages_reset(p->pages);
@@ -1256,7 +1285,8 @@  static void *multifd_recv_thread(void *opaque)
         p->flags &= ~MULTIFD_FLAG_SYNC;
         qemu_mutex_unlock(&p->mutex);
 
-        if (p->normal_num) {
+        if (p->normal_num + p->zero_num) {
+            assert(!(flags & MULTIFD_FLAG_SYNC));
             ret = multifd_recv_state->ops->recv_pages(p, &local_err);
             if (ret != 0) {
                 break;
diff --git a/migration/multifd.h b/migration/multifd.h
index 9822ff298a..125f0bbe60 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -53,6 +53,11 @@  typedef struct {
     uint32_t unused32[1];    /* Reserved for future use */
     uint64_t unused64[3];    /* Reserved for future use */
     char ramblock[256];
+    /*
+     * This array contains the pointers to:
+     *  - normal pages (initial normal_pages entries)
+     *  - zero pages (following zero_pages entries)
+     */
     uint64_t offset[];
 } __attribute__((packed)) MultiFDPacket_t;
 
@@ -224,6 +229,8 @@  typedef struct {
 
 void multifd_register_ops(int method, MultiFDMethods *ops);
 void multifd_send_fill_packet(MultiFDSendParams *p);
+void multifd_zero_page_check_send(MultiFDSendParams *p);
+void multifd_zero_page_check_recv(MultiFDRecvParams *p);
 
 static inline void multifd_send_prepare_header(MultiFDSendParams *p)
 {
diff --git a/qapi/migration.json b/qapi/migration.json
index 99843a8e95..e2450b92d4 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -660,9 +660,11 @@ 
 #
 # @none: Do not perform zero page checking.
 #
+# @multifd: Perform zero page checking on the multifd sender thread. (since 9.0)
+#
 ##
 { 'enum': 'ZeroPageDetection',
-  'data': [ 'legacy', 'none' ] }
+  'data': [ 'legacy', 'none', 'multifd' ] }
 
 ##
 # @BitmapMigrationBitmapAliasTransform: