
[RFC,11/21] migration: Add hugetlb-doublemap cap

Message ID 20230117220914.2062125-12-peterx@redhat.com
State New
Series migration: Support hugetlb doublemaps

Commit Message

Peter Xu Jan. 17, 2023, 10:09 p.m. UTC
Add a new cap to allow mapping hugetlbfs backed RAMs in small page sizes.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 migration/migration.c | 48 ++++++++++++++++++++++++++++++++++++++++++-
 migration/migration.h |  1 +
 qapi/migration.json   |  7 ++++++-
 3 files changed, 54 insertions(+), 2 deletions(-)
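
For reference, once both source and destination binaries support it, the
new capability would be enabled like any other migration capability on
both sides before triggering postcopy.  A minimal QMP sketch (the
capability name comes from the qapi/migration.json hunk in this patch;
pairing it with postcopy-ram is an assumption, since doublemap targets
postcopy page faults):

    { "execute": "migrate-set-capabilities",
      "arguments": { "capabilities": [
          { "capability": "postcopy-ram",      "state": true },
          { "capability": "hugetlb-doublemap", "state": true } ] } }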

Comments

Dr. David Alan Gilbert Jan. 24, 2023, 12:45 p.m. UTC | #1
* Peter Xu (peterx@redhat.com) wrote:
> Add a new cap to allow mapping hugetlbfs backed RAMs in small page sizes.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>


Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

although, I'm curious if the protocol actually changes - or whether
a doublemap-enabled destination would work with an unmodified source?
I guess potentially you can get away without the dirty clearing of the
partially sent hugepages that the source normally does.

Dave

Peter Xu Jan. 24, 2023, 9:15 p.m. UTC | #2
On Tue, Jan 24, 2023 at 12:45:38PM +0000, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > Add a new cap to allow mapping hugetlbfs backed RAMs in small page sizes.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> 
> 
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Thanks.

> 
> although, I'm curious if the protocol actually changes

Yes it does.

It differs not in the form of a changed header or any frame definition,
but in the order in which huge pages are sent.  The old binary can only
send a huge page by sending all of its small pages sequentially, from
index 0 to index N_HUGE-1, while the new binary can send the small pages
of a huge page out of order.  For the latter it's the same as when huge
pages are not used.

> or whether a doublemap-enabled destination would work with an unmodified
> source?

This is an interesting question.

I would expect old -> new to work as usual, because the page frames are
not modified, so the dest node will just see pages being migrated in a
sequential manner.  The page request latency will be the same as with the
old binary though, because even if the dest host can handle small pages it
cannot get the pages it wants any sooner - the src host decides which page
to send.

Meanwhile new -> old shouldn't work, I think, as described above, because
the dest host will see weird things happening, e.g., a huge page sent
starting not from index 0 but from some index X (0 < X < N_HUGE-1).  It
should quickly bail out assuming there's something wrong.
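
To make the failure mode concrete, here is a minimal standalone sketch of
the in-order expectation an unmodified destination effectively has (the
function and state names are hypothetical illustrations, not the actual
migration/ram.c code):

    /* Hypothetical sketch: an old destination implicitly expects the
     * small pages of one huge page to arrive from index 0, in order. */
    #include <stdbool.h>
    #include <stddef.h>

    static size_t expected_idx;   /* next expected small-page index */

    /* Returns false ("bail out") on the first out-of-order page, which
     * is what a new, out-of-order source would trigger immediately. */
    static bool accept_small_page(size_t idx, size_t n_huge)
    {
        if (idx != expected_idx) {
            return false;                   /* stream looks corrupt */
        }
        expected_idx = (expected_idx + 1) % n_huge;  /* wrap per huge page */
        return true;
    }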

> I guess potentially you can get away without the dirty clearing
> of the partially sent hugepages that the source normally does.

Good point.  It's actually more relevant to a later patch in the series
that reworks the discard logic.  I kept it as-is mainly for two reasons:

 1) It is still not 100% confirmed how MADV_DONTNEED should behave on
    HGM-enabled memory ranges where huge pages used to be mapped.  It's
    part of the upstream discussion on the kernel patchset.  I think it's
    settling, but in the current series I kept the logic in a form that
    will work in all cases.

 2) Not dirtying the partially sent huge pages always reduces the number
    of small pages being migrated, but it can also change the content of
    the discard messages due to the frame format of
    MIG_CMD_POSTCOPY_RAM_DISCARD: we can end up with many more scattered
    ranges, so a lot more messaging can be needed.  Whereas with the
    existing logic, since we always re-dirty the partially sent pages,
    the ranges are more likely to be efficient.
    
        * CMD_POSTCOPY_RAM_DISCARD consist of:
        *      byte   version (0)
        *      byte   Length of name field (not including 0)
        *  n x byte   RAM block name
        *      byte   0 terminator (just for safety)
        *  n x        Byte ranges within the named RAMBlock
        *      be64   Start of the range
        *      be64   Length
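
As a rough illustration of why more scattered ranges mean more messaging,
here is a simplified sketch of serializing one message body following the
layout above (this is not the actual emitter, which lives in
migration/savevm.c; it assumes the standard QEMUFile helpers, omits the
command header that precedes the body, and skips error handling):

    #include "qemu/osdep.h"
    #include "qemu-file.h"    /* migration/qemu-file.h, build-internal */

    static void put_ram_discard_body(QEMUFile *f, const char *name,
                                     const uint64_t *starts,
                                     const uint64_t *lengths, size_t n)
    {
        size_t len = strlen(name);          /* must fit in one byte */

        qemu_put_byte(f, 0);                         /* version (0) */
        qemu_put_byte(f, len);                       /* name length */
        qemu_put_buffer(f, (const uint8_t *)name, len);
        qemu_put_byte(f, 0);                         /* terminator */

        /* Each scattered range costs 16 more bytes on the wire. */
        for (size_t i = 0; i < n; i++) {
            qemu_put_be64(f, starts[i]);             /* range start */
            qemu_put_be64(f, lengths[i]);            /* range length */
        }
    }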

I think 1) may not hold as the kernel series evolves, so it may not be
true anymore.  2) may still be true, but it's worth some testing
(especially with 1G pages) to see how much it could interfere with the
discard procedure.  Maybe it won't be as bad as I think.  Even if it is,
we can evaluate the tradeoff between "slower discard sync" and "fewer
pages to send"; e.g., we can consider changing the frame layout and
bumping postcopy_ram_discard_version.

I'll take a note on this one and provide more update in the next version.
Juan Quintela Jan. 30, 2023, 5:13 a.m. UTC | #3
Peter Xu <peterx@redhat.com> wrote:
> Add a new cap to allow mapping hugetlbfs backed RAMs in small page sizes.
>
> Signed-off-by: Peter Xu <peterx@redhat.com>

Reviewed-by: Juan Quintela <quintela@redhat.com>

> +bool migrate_hugetlb_doublemap(void)
> +{
> +    MigrationState *s = migrate_get_current();
> +
> +    return s->enabled_capabilities[MIGRATION_CAPABILITY_HUGETLB_DOUBLEMAP];
> +}

I think it was not our finest moment when we decided to name the
functions that query capabilities without a verb, but well, everything
else uses "this" convention, so ....

Patch

diff --git a/migration/migration.c b/migration/migration.c
index 64f74534e2..b174f2af92 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -17,6 +17,7 @@ 
 #include "qemu/cutils.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
+#include "qemu/madvise.h"
 #include "migration/blocker.h"
 #include "exec.h"
 #include "fd.h"
@@ -62,6 +63,7 @@ 
 #include "sysemu/cpus.h"
 #include "yank_functions.h"
 #include "sysemu/qtest.h"
+#include "exec/ramblock.h"
 
 #define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */
 
@@ -1363,12 +1365,47 @@  static bool migrate_caps_check(bool *cap_list,
                    "Zero copy only available for non-compressed non-TLS multifd migration");
         return false;
     }
+
+    if (cap_list[MIGRATION_CAPABILITY_HUGETLB_DOUBLEMAP]) {
+        RAMBlock *rb;
+
+        /* Check whether the platform/binary supports the new madvise()s */
+
+#if QEMU_MADV_SPLIT == QEMU_MADV_INVALID
+        error_setg(errp, "MADV_SPLIT is not supported by the QEMU binary");
+        return false;
+#endif
+
+#if QEMU_MADV_COLLAPSE == QEMU_MADV_INVALID
+        error_setg(errp, "MADV_COLLAPSE is not supported by the QEMU binary");
+        return false;
+#endif
+
+        /*
+         * Checking kernel support of MADV_SPLIT is not easy, so delay
+         * that until we have all the hugetlb mappings ready on the dest
+         * node.  Meanwhile do a best-effort check here, because doublemap
+         * requires the hugetlb ramblocks to be shared first.
+         */
+        RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
+            if (qemu_ram_is_hugetlb(rb) && !qemu_ram_is_shared(rb)) {
+                error_setg(errp, "RAMBlock '%s' needs to be shared for doublemap",
+                           rb->idstr);
+                return false;
+            }
+        }
+    }
 #else
     if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND]) {
         error_setg(errp,
                    "Zero copy currently only available on Linux");
         return false;
     }
+
+    if (cap_list[MIGRATION_CAPABILITY_HUGETLB_DOUBLEMAP]) {
+        error_setg(errp, "Hugetlb doublemap is only supported on Linux");
+        return false;
+    }
 #endif
 
     if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]) {
@@ -2792,6 +2829,13 @@  bool migrate_postcopy_preempt(void)
     return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT];
 }
 
+bool migrate_hugetlb_doublemap(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_HUGETLB_DOUBLEMAP];
+}
+
 /* migration thread support */
 /*
  * Something bad happened to the RP stream, mark an error
@@ -4472,7 +4516,9 @@  static Property migration_properties[] = {
     DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
     DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
     DEFINE_PROP_MIG_CAP("x-background-snapshot",
-            MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
+                        MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
+    DEFINE_PROP_MIG_CAP("hugetlb-doublemap",
+                        MIGRATION_CAPABILITY_HUGETLB_DOUBLEMAP),
 #ifdef CONFIG_LINUX
     DEFINE_PROP_MIG_CAP("x-zero-copy-send",
             MIGRATION_CAPABILITY_ZERO_COPY_SEND),
diff --git a/migration/migration.h b/migration/migration.h
index 5674a13876..bbd610a2d5 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -447,6 +447,7 @@  bool migrate_use_events(void);
 bool migrate_postcopy_blocktime(void);
 bool migrate_background_snapshot(void);
 bool migrate_postcopy_preempt(void);
+bool migrate_hugetlb_doublemap(void);
 
 /* Sending on the return path - generic and then for each message type */
 void migrate_send_rp_shut(MigrationIncomingState *mis,
diff --git a/qapi/migration.json b/qapi/migration.json
index 88ecf86ac8..b23516e75e 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -477,6 +477,11 @@ 
 #                    will be handled faster.  This is a performance feature and
 #                    should not affect the correctness of postcopy migration.
 #                    (since 7.1)
+# @hugetlb-doublemap: If enabled, the migration process will allow postcopy
+#                     to handle page faults based on small pages even if
+#                     hugetlb is used.  This will drastically reduce page
+#                     fault latencies when hugetlb is used as the guest RAM
+#                     backend. (since 7.3)
 #
 # Features:
 # @unstable: Members @x-colo and @x-ignore-shared are experimental.
@@ -492,7 +497,7 @@ 
            'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
            { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
            'validate-uuid', 'background-snapshot',
-           'zero-copy-send', 'postcopy-preempt'] }
+           'zero-copy-send', 'postcopy-preempt', 'hugetlb-doublemap'] }
 
 ##
 # @MigrationCapabilityStatus: