diff mbox series

[RFC,v2,4/5] Add migration support for KVM guest with MTE

Message ID 881871e8394fa18a656dfb105d42e6099335c721.1615972140.git.haibo.xu@linaro.org
State New
Headers show
Series target/arm: Add MTE support to KVM guest | expand

Commit Message

Haibo Xu March 17, 2021, 9:28 a.m. UTC
To make it easier to keep the page tags in sync
with the page data, tags for one page are appended
to the data during ram save iteration.

This patch only adds the pre-copy migration support.
Post-copy and compression, as well as zero page saving,
are not supported yet.

Signed-off-by: Haibo Xu <haibo.xu@linaro.org>
---
 include/hw/arm/virt.h    |  2 +
 include/migration/misc.h |  1 +
 migration/ram.c          | 86 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 88 insertions(+), 1 deletion(-)

Comments

Dr. David Alan Gilbert March 17, 2021, 8:11 p.m. UTC | #1
* Haibo Xu (haibo.xu@linaro.org) wrote:
> To make it easier to keep the page tags sync with
> the page data, tags for one page are appended to
> the data during ram save iteration.
> 
> This patch only add the pre-copy migration support.
> Post-copy and compress as well as zero page saving
> are not supported yet.
> 
> Signed-off-by: Haibo Xu <haibo.xu@linaro.org>

My guess is that this doesn't work with a lot of other options; e.g.
postcopy and probably compression and a bunch of other things.
Postcopy I can see you'll need some interesting kernel changes for -
you'd need to be able to atomically place a page with its tag data.

You probably need to add stuff to migrate_caps_check to disable
features that you don't support.

> ---
>  include/hw/arm/virt.h    |  2 +
>  include/migration/misc.h |  1 +
>  migration/ram.c          | 86 +++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 88 insertions(+), 1 deletion(-)
> 
> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> index 921416f918..8b28cde8bf 100644
> --- a/include/hw/arm/virt.h
> +++ b/include/hw/arm/virt.h
> @@ -166,6 +166,8 @@ struct VirtMachineState {
>      PCIBus *bus;
>      char *oem_id;
>      char *oem_table_id;
> +    /* migrate memory tags */
> +    NotifierWithReturn precopy_notifier;
>  };
>  
>  #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)
> diff --git a/include/migration/misc.h b/include/migration/misc.h
> index bccc1b6b44..005133f471 100644
> --- a/include/migration/misc.h
> +++ b/include/migration/misc.h
> @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n);
>  void precopy_remove_notifier(NotifierWithReturn *n);
>  int precopy_notify(PrecopyNotifyReason reason, Error **errp);
>  void precopy_enable_free_page_optimization(void);
> +void precopy_enable_metadata_migration(void);
>  
>  void ram_mig_init(void);
>  void qemu_guest_free_page_hint(void *addr, size_t len);
> diff --git a/migration/ram.c b/migration/ram.c
> index 72143da0ac..e67b798c3b 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -53,10 +53,12 @@
>  #include "block.h"
>  #include "sysemu/sysemu.h"
>  #include "sysemu/cpu-throttle.h"
> +#include "sysemu/kvm.h"
>  #include "savevm.h"
>  #include "qemu/iov.h"
>  #include "multifd.h"
>  #include "sysemu/runstate.h"
> +#include "kvm_arm.h"
>  
>  #if defined(__linux__)
>  #include "qemu/userfaultfd.h"
> @@ -80,6 +82,9 @@
>  #define RAM_SAVE_FLAG_XBZRLE   0x40
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
> +#define RAM_SAVE_FLAG_MTE              0x200

I think that's using the last free bit in the flags field, which is a
bit painful.

> +#define MTE_GRANULE_SIZE   (16)
>  
>  static inline bool is_zero_range(uint8_t *p, uint64_t size)
>  {
> @@ -317,6 +322,8 @@ struct RAMState {
>      bool ram_bulk_stage;
>      /* The free page optimization is enabled */
>      bool fpo_enabled;
> +    /* The RAM meta data(e.t memory tag) is enabled */
> +    bool metadata_enabled;
>      /* How many times we have dirty too many pages */
>      int dirty_rate_high_cnt;
>      /* these variables are used for bitmap sync */
> @@ -394,6 +401,15 @@ void precopy_enable_free_page_optimization(void)
>      ram_state->fpo_enabled = true;
>  }
>  
> +void precopy_enable_metadata_migration(void)
> +{
> +    if (!ram_state) {
> +        return;
> +    }
> +
> +    ram_state->metadata_enabled = true;
> +}
> +
>  uint64_t ram_bytes_remaining(void)
>  {
>      return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
> @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>      return true;
>  }
>  
> +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> +{
> +    uint8_t *tag_buf = NULL;
> +    uint64_t ipa;
> +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;

This function needs to be mostly somewhere aarch specific; or somehow
made more generic.
We shouldn't have RAM_SAVE_FLAG_MTE as well - we should be something
like RAM_SAVE_FLAG_ARCH_METADATA and that way other architectures with
something else glued onto pages can do something similar.
Try and keep migration/ architecture independent.

> +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> +        /* Buffer for the page tags(one byte per tag) */
> +        tag_buf = g_try_malloc0(size);

It feels like you want to allocate tag_buf in setup and free it at the
end rather than doing this in every page.

> +        if (!tag_buf) {
> +            error_report("%s: Error allocating MTE tag_buf", __func__);
> +            return 0;
> +        }
> +
> +        if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
> +            error_report("%s: Can't get MTE tags from guest", __func__);

For any error like this you probably want to say the addresses to make
it easier to debug when it fails.

> +            g_free(tag_buf);
> +            return 0;
> +        }
> +
> +        qemu_put_buffer(f, tag_buf, size);
> +
> +        g_free(tag_buf);
> +
> +        return size;
> +    }
> +
> +    return 0;
> +}
> +
> +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> +{
> +    uint8_t *tag_buf = NULL;
> +    uint64_t ipa;
> +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
> +
> +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> +        /* Buffer for the page tags(one byte per tag) */
> +        tag_buf = g_try_malloc0(size);
> +        if (!tag_buf) {
> +            error_report("%s: Error allocating MTE tag_buf", __func__);
> +            return;
> +        }
> +
> +        qemu_get_buffer(f, tag_buf, size);
> +        if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {

what protections are there here to stop you setting the mte on something
useful, like part of the host kernel or qemu?

> +            error_report("%s: Can't set MTE tags to guest", __func__);
> +        }
> +
> +        g_free(tag_buf);
> +    }
> +
> +    return;
> +}
> +
>  /*
>   * directly send the page to the stream
>   *
> @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>  static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>                              uint8_t *buf, bool async)
>  {
> +    if (rs->metadata_enabled) {
> +        offset |= RAM_SAVE_FLAG_MTE;
> +    }
> +
>      ram_counters.transferred += save_page_header(rs, rs->f, block,
>                                                   offset | RAM_SAVE_FLAG_PAGE);
>      if (async) {
> @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>      }
>      ram_counters.transferred += TARGET_PAGE_SIZE;
>      ram_counters.normal++;
> +
> +    if (rs->metadata_enabled) {
> +        ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf);
> +    }
> +
>      return 1;
>  }
>  
> @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs)
>      rs->last_version = ram_list.version;
>      rs->ram_bulk_stage = true;
>      rs->fpo_enabled = false;
> +    rs->metadata_enabled = false;
>  }
>  
>  #define MAX_WAIT 50 /* ms, half buffered_file limit */
> @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f)
>              trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
>          }
>  
> -        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
> +        switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) {
>          case RAM_SAVE_FLAG_MEM_SIZE:
>              /* Synchronize RAM block list */
>              total_ram_bytes = addr;
> @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f)
>  
>          case RAM_SAVE_FLAG_PAGE:
>              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
> +            if (flags & RAM_SAVE_FLAG_MTE) {
> +                load_normal_page_mte_tags(f, host);
> +            }
>              break;
>  
>          case RAM_SAVE_FLAG_COMPRESS_PAGE:
> -- 
> 2.17.1
>
Haibo Xu March 18, 2021, 6:38 a.m. UTC | #2
On Thu, 18 Mar 2021 at 04:11, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
>
> * Haibo Xu (haibo.xu@linaro.org) wrote:
> > To make it easier to keep the page tags sync with
> > the page data, tags for one page are appended to
> > the data during ram save iteration.
> >
> > This patch only add the pre-copy migration support.
> > Post-copy and compress as well as zero page saving
> > are not supported yet.
> >
> > Signed-off-by: Haibo Xu <haibo.xu@linaro.org>
>
> My guess is that this doesn't work with a lot of other options; e.g.
> postcopy and probably compression and a bunch of other things.
> Postcopy I can see you'll need some interesting kernel changes for -
> you'd need to be able to atomically place a  page with it's tag data.
>
> You probably need to add stuff to migrate_caps_check  to disable
> features that you don't support.
>

Hi David,

Thanks so much for the comments!

You are right, this RFC patch only supports pre-copy mode, no
postcopy, no compression.
As an RFC, here we just want to finalize the tag migration process, that is:
1. let the tag go with the page data (the current choice), which may be
a little complex to put
    them into the current migration process.
2. migrate them separately, which is easy to implement with the current
migration (treat the tags
    as device status), but it would be hard to keep the page data and
tag in sync with each other.
3. Any other ways?

Once the tag migration process is finalized, a new formal patch series
with postcopy as well as
compression should be reworked.

What's more, you mentioned that "some interesting kernel changes are
needed to atomically
place a  page with it's tag data". You mean a single kernel API to
store page data and tag in
the migration load process?

Regards,
Haibo

> > ---
> >  include/hw/arm/virt.h    |  2 +
> >  include/migration/misc.h |  1 +
> >  migration/ram.c          | 86 +++++++++++++++++++++++++++++++++++++++-
> >  3 files changed, 88 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> > index 921416f918..8b28cde8bf 100644
> > --- a/include/hw/arm/virt.h
> > +++ b/include/hw/arm/virt.h
> > @@ -166,6 +166,8 @@ struct VirtMachineState {
> >      PCIBus *bus;
> >      char *oem_id;
> >      char *oem_table_id;
> > +    /* migrate memory tags */
> > +    NotifierWithReturn precopy_notifier;
> >  };
> >
> >  #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)
> > diff --git a/include/migration/misc.h b/include/migration/misc.h
> > index bccc1b6b44..005133f471 100644
> > --- a/include/migration/misc.h
> > +++ b/include/migration/misc.h
> > @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n);
> >  void precopy_remove_notifier(NotifierWithReturn *n);
> >  int precopy_notify(PrecopyNotifyReason reason, Error **errp);
> >  void precopy_enable_free_page_optimization(void);
> > +void precopy_enable_metadata_migration(void);
> >
> >  void ram_mig_init(void);
> >  void qemu_guest_free_page_hint(void *addr, size_t len);
> > diff --git a/migration/ram.c b/migration/ram.c
> > index 72143da0ac..e67b798c3b 100644
> > --- a/migration/ram.c
> > +++ b/migration/ram.c
> > @@ -53,10 +53,12 @@
> >  #include "block.h"
> >  #include "sysemu/sysemu.h"
> >  #include "sysemu/cpu-throttle.h"
> > +#include "sysemu/kvm.h"
> >  #include "savevm.h"
> >  #include "qemu/iov.h"
> >  #include "multifd.h"
> >  #include "sysemu/runstate.h"
> > +#include "kvm_arm.h"
> >
> >  #if defined(__linux__)
> >  #include "qemu/userfaultfd.h"
> > @@ -80,6 +82,9 @@
> >  #define RAM_SAVE_FLAG_XBZRLE   0x40
> >  /* 0x80 is reserved in migration.h start with 0x100 next */
> >  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
> > +#define RAM_SAVE_FLAG_MTE              0x200
>
> I think that's using the last free bit in the flags field, which is a
> bit painful.
>

Yes, only 0x80(reserved) and 0x200 are available.

> > +#define MTE_GRANULE_SIZE   (16)
> >
> >  static inline bool is_zero_range(uint8_t *p, uint64_t size)
> >  {
> > @@ -317,6 +322,8 @@ struct RAMState {
> >      bool ram_bulk_stage;
> >      /* The free page optimization is enabled */
> >      bool fpo_enabled;
> > +    /* The RAM meta data(e.t memory tag) is enabled */
> > +    bool metadata_enabled;
> >      /* How many times we have dirty too many pages */
> >      int dirty_rate_high_cnt;
> >      /* these variables are used for bitmap sync */
> > @@ -394,6 +401,15 @@ void precopy_enable_free_page_optimization(void)
> >      ram_state->fpo_enabled = true;
> >  }
> >
> > +void precopy_enable_metadata_migration(void)
> > +{
> > +    if (!ram_state) {
> > +        return;
> > +    }
> > +
> > +    ram_state->metadata_enabled = true;
> > +}
> > +
> >  uint64_t ram_bytes_remaining(void)
> >  {
> >      return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
> > @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> >      return true;
> >  }
> >
> > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> > +{
> > +    uint8_t *tag_buf = NULL;
> > +    uint64_t ipa;
> > +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
>
> This function needs to be mostly somewhere aarch specific; or somehow
> made more generic.
> We shouldn't have RAM_SAVE_FLAG_MTE as well - we should be something
> like RAM_SAVE_FLAG_ARCH_METADATA and that way other architectures with
> something else glued onto pages can do something similar.
> Try and keep migration/ architecture independent.

Yes, it should be arch independent and handled in a more abstract way.

>
> > +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> > +        /* Buffer for the page tags(one byte per tag) */
> > +        tag_buf = g_try_malloc0(size);
>
> It feels like you want to allocate tag_buf in setup and free it at the
> end rather than doing this in every page.

Yes, the tag buffer allocation could be moved to migration save/load
setup and cleanup function.

>
> > +        if (!tag_buf) {
> > +            error_report("%s: Error allocating MTE tag_buf", __func__);
> > +            return 0;
> > +        }
> > +
> > +        if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
> > +            error_report("%s: Can't get MTE tags from guest", __func__);
>
> For any error like this you probably want to say the addresses to make
> it easier to debug when it fails.
>

Good suggestion! Will add the "address" info to the log.

> > +            g_free(tag_buf);
> > +            return 0;
> > +        }
> > +
> > +        qemu_put_buffer(f, tag_buf, size);
> > +
> > +        g_free(tag_buf);
> > +
> > +        return size;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> > +{
> > +    uint8_t *tag_buf = NULL;
> > +    uint64_t ipa;
> > +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
> > +
> > +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> > +        /* Buffer for the page tags(one byte per tag) */
> > +        tag_buf = g_try_malloc0(size);
> > +        if (!tag_buf) {
> > +            error_report("%s: Error allocating MTE tag_buf", __func__);
> > +            return;
> > +        }
> > +
> > +        qemu_get_buffer(f, tag_buf, size);
> > +        if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
>
> what protections are there here to stop you setting the mte on something
> useful, like part of the host kernel or qemu?

Kernel would do some parameter check (e.g. ipa within a valid slot)
before setting the tag data.

>
> > +            error_report("%s: Can't set MTE tags to guest", __func__);
> > +        }
> > +
> > +        g_free(tag_buf);
> > +    }
> > +
> > +    return;
> > +}
> > +
> >  /*
> >   * directly send the page to the stream
> >   *
> > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> >  static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> >                              uint8_t *buf, bool async)
> >  {
> > +    if (rs->metadata_enabled) {
> > +        offset |= RAM_SAVE_FLAG_MTE;
> > +    }
> > +
> >      ram_counters.transferred += save_page_header(rs, rs->f, block,
> >                                                   offset | RAM_SAVE_FLAG_PAGE);
> >      if (async) {
> > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> >      }
> >      ram_counters.transferred += TARGET_PAGE_SIZE;
> >      ram_counters.normal++;
> > +
> > +    if (rs->metadata_enabled) {
> > +        ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf);
> > +    }
> > +
> >      return 1;
> >  }
> >
> > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs)
> >      rs->last_version = ram_list.version;
> >      rs->ram_bulk_stage = true;
> >      rs->fpo_enabled = false;
> > +    rs->metadata_enabled = false;
> >  }
> >
> >  #define MAX_WAIT 50 /* ms, half buffered_file limit */
> > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f)
> >              trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
> >          }
> >
> > -        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
> > +        switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) {
> >          case RAM_SAVE_FLAG_MEM_SIZE:
> >              /* Synchronize RAM block list */
> >              total_ram_bytes = addr;
> > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f)
> >
> >          case RAM_SAVE_FLAG_PAGE:
> >              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
> > +            if (flags & RAM_SAVE_FLAG_MTE) {
> > +                load_normal_page_mte_tags(f, host);
> > +            }
> >              break;
> >
> >          case RAM_SAVE_FLAG_COMPRESS_PAGE:
> > --
> > 2.17.1
> >
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
Juan Quintela March 25, 2021, 3:37 p.m. UTC | #3
Haibo Xu <haibo.xu@linaro.org> wrote:
> To make it easier to keep the page tags sync with
> the page data, tags for one page are appended to
> the data during ram save iteration.
>
> This patch only add the pre-copy migration support.
> Post-copy and compress as well as zero page saving
> are not supported yet.
>
> Signed-off-by: Haibo Xu <haibo.xu@linaro.org>


>  #define RAM_SAVE_FLAG_XBZRLE   0x40
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
> +#define RAM_SAVE_FLAG_MTE              0x200

Flags are really a scarce resource.  You are using one here, when you
know that you will always have the feature enabled (or not), so you can
do better during negotiation IMHO.



> +void precopy_enable_metadata_migration(void)
> +{
> +    if (!ram_state) {
> +        return;
> +    }
> +
> +    ram_state->metadata_enabled = true;
> +}

My understanding is that in your following patch, if mte is enabled, you
will always send mte tags, for all pages needed, right?

> +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> +{
> +    uint8_t *tag_buf = NULL;
> +    uint64_t ipa;
> +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
> +
> +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> +        /* Buffer for the page tags(one byte per tag) */
> +        tag_buf = g_try_malloc0(size);

size of the buffer is known at start of migration.  Just get a buffer
and reuse it?

Do zero pages have mte tags?  From migration point of view, a zero page
is a page that is just full of zeros, i.e. nothing else special.
Because you are not sending any for them.



> @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>  static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>                              uint8_t *buf, bool async)
>  {
> +    if (rs->metadata_enabled) {
> +        offset |= RAM_SAVE_FLAG_MTE;

You don't really need the flag, for you normal pages are just
TARGET_PAGE_SIZE + (TARGET_PAGE_SIZE/MTE_)


> +    }
> +
>      ram_counters.transferred += save_page_header(rs, rs->f, block,
>                                                   offset | RAM_SAVE_FLAG_PAGE);
>      if (async) {
> @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
>      }
>      ram_counters.transferred += TARGET_PAGE_SIZE;
>      ram_counters.normal++;
> +
> +    if (rs->metadata_enabled) {

See?  You are not checking the flag, you are checking the bool setup at
the beginning of migration.

> +        ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf);
> +    }
> +
>      return 1;
>  }
>  
> @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs)
>      rs->last_version = ram_list.version;
>      rs->ram_bulk_stage = true;
>      rs->fpo_enabled = false;
> +    rs->metadata_enabled = false;
>  }
>  
>  #define MAX_WAIT 50 /* ms, half buffered_file limit */
> @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f)
>              trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
>          }
>  
> -        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
> +        switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) {

Creating the flag is hurting you here also.

>          case RAM_SAVE_FLAG_MEM_SIZE:
>              /* Synchronize RAM block list */
>              total_ram_bytes = addr;
> @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f)
>  
>          case RAM_SAVE_FLAG_PAGE:
>              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
> +            if (flags & RAM_SAVE_FLAG_MTE) {
> +                load_normal_page_mte_tags(f, host);
> +            }

I don't claim to understand the MTE, but my understanding is that if we
are using MTE, all pages have to have MTE flags, right?

So, something like

is_mte_enabled()

that I told in the other thread looks like a good idea.

Later, Juan.

>              break;
>  
>          case RAM_SAVE_FLAG_COMPRESS_PAGE:
Dr. David Alan Gilbert March 25, 2021, 7:37 p.m. UTC | #4
* Haibo Xu (haibo.xu@linaro.org) wrote:
> On Thu, 18 Mar 2021 at 04:11, Dr. David Alan Gilbert
> <dgilbert@redhat.com> wrote:
> >
> > * Haibo Xu (haibo.xu@linaro.org) wrote:
> > > To make it easier to keep the page tags sync with
> > > the page data, tags for one page are appended to
> > > the data during ram save iteration.
> > >
> > > This patch only add the pre-copy migration support.
> > > Post-copy and compress as well as zero page saving
> > > are not supported yet.
> > >
> > > Signed-off-by: Haibo Xu <haibo.xu@linaro.org>
> >
> > My guess is that this doesn't work with a lot of other options; e.g.
> > postcopy and probably compression and a bunch of other things.
> > Postcopy I can see you'll need some interesting kernel changes for -
> > you'd need to be able to atomically place a  page with it's tag data.
> >
> > You probably need to add stuff to migrate_caps_check  to disable
> > features that you don't support.
> >
> 
> Hi David,
> 
> Thanks so much for the comments!
> 
> You are right, this RFC patch only supports pre-copy mode, no
> postcopy, no compression.
> As a RFC, here just want to finalize the tag migration process, that is:
> 1. let the tag go with the page data(the current choice) which may be
> a little complex to put
>     them into the current migration process.

I think it's probably the easiest.

> 2. migrate them separately which is easy to implement with the current
> migration(treat the tags
>     as device status), but it would be hard to keep the page data and
> tag to sync with each other.
> 3. Any other ways?
> 
> Once the tag migration process is finalized, a new formal patch series
> with postcopy as well as
> compression should be reworked.

It's probably best to look at what you would have to do to fit it in with
those before finalising the basic version; not necessarily implement them
- just get an idea.

> What's more, you mentioned that "some interesting kernel changes are
> needed to atomically
> place a  page with it's tag data". You mean a single kernel API to
> store page data and tag in
> the migration load process?

Yes; see migration/postcopy-ram.c postcopy_place_page - it has the
problem of placing the whole page in memory before the guest starts
using it; I think for MTE you're going to have to put the MTE data in at
the same time.

Dave

> 
> Regards,
> Haibo
> 
> > > ---
> > >  include/hw/arm/virt.h    |  2 +
> > >  include/migration/misc.h |  1 +
> > >  migration/ram.c          | 86 +++++++++++++++++++++++++++++++++++++++-
> > >  3 files changed, 88 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> > > index 921416f918..8b28cde8bf 100644
> > > --- a/include/hw/arm/virt.h
> > > +++ b/include/hw/arm/virt.h
> > > @@ -166,6 +166,8 @@ struct VirtMachineState {
> > >      PCIBus *bus;
> > >      char *oem_id;
> > >      char *oem_table_id;
> > > +    /* migrate memory tags */
> > > +    NotifierWithReturn precopy_notifier;
> > >  };
> > >
> > >  #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)
> > > diff --git a/include/migration/misc.h b/include/migration/misc.h
> > > index bccc1b6b44..005133f471 100644
> > > --- a/include/migration/misc.h
> > > +++ b/include/migration/misc.h
> > > @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n);
> > >  void precopy_remove_notifier(NotifierWithReturn *n);
> > >  int precopy_notify(PrecopyNotifyReason reason, Error **errp);
> > >  void precopy_enable_free_page_optimization(void);
> > > +void precopy_enable_metadata_migration(void);
> > >
> > >  void ram_mig_init(void);
> > >  void qemu_guest_free_page_hint(void *addr, size_t len);
> > > diff --git a/migration/ram.c b/migration/ram.c
> > > index 72143da0ac..e67b798c3b 100644
> > > --- a/migration/ram.c
> > > +++ b/migration/ram.c
> > > @@ -53,10 +53,12 @@
> > >  #include "block.h"
> > >  #include "sysemu/sysemu.h"
> > >  #include "sysemu/cpu-throttle.h"
> > > +#include "sysemu/kvm.h"
> > >  #include "savevm.h"
> > >  #include "qemu/iov.h"
> > >  #include "multifd.h"
> > >  #include "sysemu/runstate.h"
> > > +#include "kvm_arm.h"
> > >
> > >  #if defined(__linux__)
> > >  #include "qemu/userfaultfd.h"
> > > @@ -80,6 +82,9 @@
> > >  #define RAM_SAVE_FLAG_XBZRLE   0x40
> > >  /* 0x80 is reserved in migration.h start with 0x100 next */
> > >  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
> > > +#define RAM_SAVE_FLAG_MTE              0x200
> >
> > I think that's using the last free bit in the flags field, which is a
> > bit painful.
> >
> 
> Yes, only 0x80(reserved) and 0x200 are available.
> 
> > > +#define MTE_GRANULE_SIZE   (16)
> > >
> > >  static inline bool is_zero_range(uint8_t *p, uint64_t size)
> > >  {
> > > @@ -317,6 +322,8 @@ struct RAMState {
> > >      bool ram_bulk_stage;
> > >      /* The free page optimization is enabled */
> > >      bool fpo_enabled;
> > > +    /* The RAM meta data(e.t memory tag) is enabled */
> > > +    bool metadata_enabled;
> > >      /* How many times we have dirty too many pages */
> > >      int dirty_rate_high_cnt;
> > >      /* these variables are used for bitmap sync */
> > > @@ -394,6 +401,15 @@ void precopy_enable_free_page_optimization(void)
> > >      ram_state->fpo_enabled = true;
> > >  }
> > >
> > > +void precopy_enable_metadata_migration(void)
> > > +{
> > > +    if (!ram_state) {
> > > +        return;
> > > +    }
> > > +
> > > +    ram_state->metadata_enabled = true;
> > > +}
> > > +
> > >  uint64_t ram_bytes_remaining(void)
> > >  {
> > >      return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
> > > @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> > >      return true;
> > >  }
> > >
> > > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> > > +{
> > > +    uint8_t *tag_buf = NULL;
> > > +    uint64_t ipa;
> > > +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
> >
> > This function needs to be mostly somewhere aarch specific; or somehow
> > made more generic.
> > We shouldn't have RAM_SAVE_FLAG_MTE as well - we should be something
> > like RAM_SAVE_FLAG_ARCH_METADATA and that way other architectures with
> > something else glued onto pages can do something similar.
> > Try and keep migration/ architecture independent.
> 
> Yes, it should be arch independent and handled with a more abstract way.
> 
> >
> > > +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> > > +        /* Buffer for the page tags(one byte per tag) */
> > > +        tag_buf = g_try_malloc0(size);
> >
> > It feels like you want to allocate tag_buf in setup and free it at the
> > end rather than doing this in every page.
> 
> Yes, the tag buffer allocation could be moved to migration save/load
> setup and cleanup function.
> 
> >
> > > +        if (!tag_buf) {
> > > +            error_report("%s: Error allocating MTE tag_buf", __func__);
> > > +            return 0;
> > > +        }
> > > +
> > > +        if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
> > > +            error_report("%s: Can't get MTE tags from guest", __func__);
> >
> > For any error like this you probably want to say the addresses to make
> > it easier to debug when it fails.
> >
> 
> Good suggestion! Will add the "address" info to the log.
> 
> > > +            g_free(tag_buf);
> > > +            return 0;
> > > +        }
> > > +
> > > +        qemu_put_buffer(f, tag_buf, size);
> > > +
> > > +        g_free(tag_buf);
> > > +
> > > +        return size;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> > > +{
> > > +    uint8_t *tag_buf = NULL;
> > > +    uint64_t ipa;
> > > +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
> > > +
> > > +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> > > +        /* Buffer for the page tags(one byte per tag) */
> > > +        tag_buf = g_try_malloc0(size);
> > > +        if (!tag_buf) {
> > > +            error_report("%s: Error allocating MTE tag_buf", __func__);
> > > +            return;
> > > +        }
> > > +
> > > +        qemu_get_buffer(f, tag_buf, size);
> > > +        if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
> >
> > what protections are there here to stop you setting the mte on something
> > useful, like part of the host kernel or qemu?
> 
> Kernel would do some parameter check(e.t ipa within a valid slot)
> before setting the tag data.
> 
> >
> > > +            error_report("%s: Can't set MTE tags to guest", __func__);
> > > +        }
> > > +
> > > +        g_free(tag_buf);
> > > +    }
> > > +
> > > +    return;
> > > +}
> > > +
> > >  /*
> > >   * directly send the page to the stream
> > >   *
> > > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> > >  static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> > >                              uint8_t *buf, bool async)
> > >  {
> > > +    if (rs->metadata_enabled) {
> > > +        offset |= RAM_SAVE_FLAG_MTE;
> > > +    }
> > > +
> > >      ram_counters.transferred += save_page_header(rs, rs->f, block,
> > >                                                   offset | RAM_SAVE_FLAG_PAGE);
> > >      if (async) {
> > > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> > >      }
> > >      ram_counters.transferred += TARGET_PAGE_SIZE;
> > >      ram_counters.normal++;
> > > +
> > > +    if (rs->metadata_enabled) {
> > > +        ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf);
> > > +    }
> > > +
> > >      return 1;
> > >  }
> > >
> > > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs)
> > >      rs->last_version = ram_list.version;
> > >      rs->ram_bulk_stage = true;
> > >      rs->fpo_enabled = false;
> > > +    rs->metadata_enabled = false;
> > >  }
> > >
> > >  #define MAX_WAIT 50 /* ms, half buffered_file limit */
> > > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f)
> > >              trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
> > >          }
> > >
> > > -        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
> > > +        switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) {
> > >          case RAM_SAVE_FLAG_MEM_SIZE:
> > >              /* Synchronize RAM block list */
> > >              total_ram_bytes = addr;
> > > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f)
> > >
> > >          case RAM_SAVE_FLAG_PAGE:
> > >              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
> > > +            if (flags & RAM_SAVE_FLAG_MTE) {
> > > +                load_normal_page_mte_tags(f, host);
> > > +            }
> > >              break;
> > >
> > >          case RAM_SAVE_FLAG_COMPRESS_PAGE:
> > > --
> > > 2.17.1
> > >
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> >
>
Haibo Xu April 1, 2021, 1:26 p.m. UTC | #5
On Thu, 25 Mar 2021 at 23:37, Juan Quintela <quintela@redhat.com> wrote:
>
> Haibo Xu <haibo.xu@linaro.org> wrote:
> > To make it easier to keep the page tags sync with
> > the page data, tags for one page are appended to
> > the data during ram save iteration.
> >
> > This patch only add the pre-copy migration support.
> > Post-copy and compress as well as zero page saving
> > are not supported yet.
> >
> > Signed-off-by: Haibo Xu <haibo.xu@linaro.org>
>
>
> >  #define RAM_SAVE_FLAG_XBZRLE   0x40
> >  /* 0x80 is reserved in migration.h start with 0x100 next */
> >  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
> > +#define RAM_SAVE_FLAG_MTE              0x200
>
> Flags are really a scarce resource.  You are using one here, when you
> know that you will always have the feature enable (or not), so you can
> do better during negotiation IMHO.
>

Yes, any suggestions are welcomed to finalize the MTE migration support!

>
> > +void precopy_enable_metadata_migration(void)
> > +{
> > +    if (!ram_state) {
> > +        return;
> > +    }
> > +
> > +    ram_state->metadata_enabled = true;
> > +}
>
> My understanding is that in your following patch, if mte is enabled, you
> will always sent mte tags, for all pages needed, right?

Yes, for the current kernel support, we can't tell whether the MTE was enabled
for a specific page.

>
> > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> > +{
> > +    uint8_t *tag_buf = NULL;
> > +    uint64_t ipa;
> > +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
> > +
> > +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> > +        /* Buffer for the page tags(one byte per tag) */
> > +        tag_buf = g_try_malloc0(size);
>
> size of the buffer is known at start of migration.  Just get a buffer
> and reuse it?

Yes, we can pre-allocate the buffer during the migration setup time

>
> Do zero pages have mte tags?  From migration point of view, a zero page
> is a page that is just full of zeros, i.e. nothing else special.
> Because you are not sending any for them.
>

Yes, I think we can do some optimization for the zero page migration support.

>
>
> > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> >  static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> >                              uint8_t *buf, bool async)
> >  {
> > +    if (rs->metadata_enabled) {
> > +        offset |= RAM_SAVE_FLAG_MTE;
>
> You don't really need the flag, for you normal pages are just
> TARGET_PAGE_SIZE + (TARGET_PAGE_SIZE/MTE_)
>

So you suggest to use the size field to indicate whether tags are available?

>
> > +    }
> > +
> >      ram_counters.transferred += save_page_header(rs, rs->f, block,
> >                                                   offset | RAM_SAVE_FLAG_PAGE);
> >      if (async) {
> > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> >      }
> >      ram_counters.transferred += TARGET_PAGE_SIZE;
> >      ram_counters.normal++;
> > +
> > +    if (rs->metadata_enabled) {
>
> See?  You are not checking the flag, you are checking the bool setup at
> the beggining of migration.

The idea is to only migrate the memory tags when MTE was enabled for the VM.
Could you elaborate more on "You are not checking the flag"?

>
> > +        ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf);
> > +    }
> > +
> >      return 1;
> >  }
> >
> > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs)
> >      rs->last_version = ram_list.version;
> >      rs->ram_bulk_stage = true;
> >      rs->fpo_enabled = false;
> > +    rs->metadata_enabled = false;
> >  }
> >
> >  #define MAX_WAIT 50 /* ms, half buffered_file limit */
> > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f)
> >              trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
> >          }
> >
> > -        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
> > +        switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) {
>
> Creating the flag is hurting you here also.
>
> >          case RAM_SAVE_FLAG_MEM_SIZE:
> >              /* Synchronize RAM block list */
> >              total_ram_bytes = addr;
> > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f)
> >
> >          case RAM_SAVE_FLAG_PAGE:
> >              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
> > +            if (flags & RAM_SAVE_FLAG_MTE) {
> > +                load_normal_page_mte_tags(f, host);
> > +            }
>
> I don't claim to understand the MTE, but my understanding is that if we
> are using MTE, all pages have to have MTE flags, right?
>
> So, something like
>
> is_mte_enabled()
>
> that I told in the other thread looks like a good idea.
>

Yes, we do need a function like this.

> Later, Juan.
>
> >              break;
> >
> >          case RAM_SAVE_FLAG_COMPRESS_PAGE:
>
Haibo Xu April 1, 2021, 1:33 p.m. UTC | #6
On Fri, 26 Mar 2021 at 03:38, Dr. David Alan Gilbert
<dgilbert@redhat.com> wrote:
>
> * Haibo Xu (haibo.xu@linaro.org) wrote:
> > On Thu, 18 Mar 2021 at 04:11, Dr. David Alan Gilbert
> > <dgilbert@redhat.com> wrote:
> > >
> > > * Haibo Xu (haibo.xu@linaro.org) wrote:
> > > > To make it easier to keep the page tags sync with
> > > > the page data, tags for one page are appended to
> > > > the data during ram save iteration.
> > > >
> > > > This patch only add the pre-copy migration support.
> > > > Post-copy and compress as well as zero page saving
> > > > are not supported yet.
> > > >
> > > > Signed-off-by: Haibo Xu <haibo.xu@linaro.org>
> > >
> > > My guess is that this doesn't work with a lot of other options; e.g.
> > > postcopy and probably compression and a bunch of other things.
> > > Postcopy I can see you'll need some interesting kernel changes for -
> > > you'd need to be able to atomically place a  page with it's tag data.
> > >
> > > You probably need to add stuff to migrate_caps_check  to disable
> > > features that you don't support.
> > >
> >
> > Hi David,
> >
> > Thanks so much for the comments!
> >
> > You are right, this RFC patch only supports pre-copy mode, no
> > postcopy, no compression.
> > As an RFC, here I just want to finalize the tag migration process, that is:
> > 1. let the tag go with the page data(the current choice) which may be
> > a little complex to put
> >     them into the current migration process.
>
> I think it's probably the easiest.
>

Agree!

> > 2. migrate them separately which is easy to implement with the current
> > migration(treat the tags
> >     as device status), but it would be hard to keep the page data and
> > tag to sync with each other.
> > 3. Any other ways?
> >
> > Once the tag migration process is finalized, a new formal patch series
> > with postcopy as well as
> > compression should be reworked.
>
> It's probably best to look what you would have to do to fit it in with
> those before finalising the basic version; not necessary implement them
> - just get an idea.
>

Thanks for the suggestion!

> > What's more, you mentioned that "some interesting kernel changes are
> > needed to atomically
> > place a  page with it's tag data". You mean a single kernel API to
> > store page data and tag in
> > the migration load process?
>
> Yes;  see migration/postcopy-ram.c postcopy_place_page  - it has the
> problem of placing the whole page in memory before the guest starts
> usign it; I think for MTE you're going to have to put the MTE data in at
> the same time.
>

Good point! Will check this with the kernel guys.

> Dave
>
> >
> > Regards,
> > Haibo
> >
> > > > ---
> > > >  include/hw/arm/virt.h    |  2 +
> > > >  include/migration/misc.h |  1 +
> > > >  migration/ram.c          | 86 +++++++++++++++++++++++++++++++++++++++-
> > > >  3 files changed, 88 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> > > > index 921416f918..8b28cde8bf 100644
> > > > --- a/include/hw/arm/virt.h
> > > > +++ b/include/hw/arm/virt.h
> > > > @@ -166,6 +166,8 @@ struct VirtMachineState {
> > > >      PCIBus *bus;
> > > >      char *oem_id;
> > > >      char *oem_table_id;
> > > > +    /* migrate memory tags */
> > > > +    NotifierWithReturn precopy_notifier;
> > > >  };
> > > >
> > > >  #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)
> > > > diff --git a/include/migration/misc.h b/include/migration/misc.h
> > > > index bccc1b6b44..005133f471 100644
> > > > --- a/include/migration/misc.h
> > > > +++ b/include/migration/misc.h
> > > > @@ -38,6 +38,7 @@ void precopy_add_notifier(NotifierWithReturn *n);
> > > >  void precopy_remove_notifier(NotifierWithReturn *n);
> > > >  int precopy_notify(PrecopyNotifyReason reason, Error **errp);
> > > >  void precopy_enable_free_page_optimization(void);
> > > > +void precopy_enable_metadata_migration(void);
> > > >
> > > >  void ram_mig_init(void);
> > > >  void qemu_guest_free_page_hint(void *addr, size_t len);
> > > > diff --git a/migration/ram.c b/migration/ram.c
> > > > index 72143da0ac..e67b798c3b 100644
> > > > --- a/migration/ram.c
> > > > +++ b/migration/ram.c
> > > > @@ -53,10 +53,12 @@
> > > >  #include "block.h"
> > > >  #include "sysemu/sysemu.h"
> > > >  #include "sysemu/cpu-throttle.h"
> > > > +#include "sysemu/kvm.h"
> > > >  #include "savevm.h"
> > > >  #include "qemu/iov.h"
> > > >  #include "multifd.h"
> > > >  #include "sysemu/runstate.h"
> > > > +#include "kvm_arm.h"
> > > >
> > > >  #if defined(__linux__)
> > > >  #include "qemu/userfaultfd.h"
> > > > @@ -80,6 +82,9 @@
> > > >  #define RAM_SAVE_FLAG_XBZRLE   0x40
> > > >  /* 0x80 is reserved in migration.h start with 0x100 next */
> > > >  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
> > > > +#define RAM_SAVE_FLAG_MTE              0x200
> > >
> > > I think that's using the last free bit in the flags field, which is a
> > > bit painful.
> > >
> >
> > Yes, only 0x80(reserved) and 0x200 are available.
> >
> > > > +#define MTE_GRANULE_SIZE   (16)
> > > >
> > > >  static inline bool is_zero_range(uint8_t *p, uint64_t size)
> > > >  {
> > > > @@ -317,6 +322,8 @@ struct RAMState {
> > > >      bool ram_bulk_stage;
> > > >      /* The free page optimization is enabled */
> > > >      bool fpo_enabled;
> > > > +    /* The RAM meta data(e.t memory tag) is enabled */
> > > > +    bool metadata_enabled;
> > > >      /* How many times we have dirty too many pages */
> > > >      int dirty_rate_high_cnt;
> > > >      /* these variables are used for bitmap sync */
> > > > @@ -394,6 +401,15 @@ void precopy_enable_free_page_optimization(void)
> > > >      ram_state->fpo_enabled = true;
> > > >  }
> > > >
> > > > +void precopy_enable_metadata_migration(void)
> > > > +{
> > > > +    if (!ram_state) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    ram_state->metadata_enabled = true;
> > > > +}
> > > > +
> > > >  uint64_t ram_bytes_remaining(void)
> > > >  {
> > > >      return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
> > > > @@ -1134,6 +1150,61 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> > > >      return true;
> > > >  }
> > > >
> > > > +static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> > > > +{
> > > > +    uint8_t *tag_buf = NULL;
> > > > +    uint64_t ipa;
> > > > +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
> > >
> > > This function needs to be mostly somewhere aarch specific; or somehow
> > > made more generic.
> > > We shouldn't have RAM_SAVE_FLAG_MTE as well - we should be something
> > > like RAM_SAVE_FLAG_ARCH_METADATA and that way other architectures with
> > > something else glued onto pages can do something similar.
> > > Try and keep migration/ architecture independent.
> >
> > Yes, it should be arch independent and handled with a more abstract way.
> >
> > >
> > > > +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> > > > +        /* Buffer for the page tags(one byte per tag) */
> > > > +        tag_buf = g_try_malloc0(size);
> > >
> > > It feels like you want to allocate tag_buf in setup and free it at the
> > > end rather than doing this in every page.
> >
> > Yes, the tag buffer allocation could be moved to migration save/load
> > setup and cleanup function.
> >
> > >
> > > > +        if (!tag_buf) {
> > > > +            error_report("%s: Error allocating MTE tag_buf", __func__);
> > > > +            return 0;
> > > > +        }
> > > > +
> > > > +        if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
> > > > +            error_report("%s: Can't get MTE tags from guest", __func__);
> > >
> > > For any error like this you probably want to say the addresses to make
> > > it easier to debug when it fails.
> > >
> >
> > Good suggestion! Will add the "address" info to the log.
> >
> > > > +            g_free(tag_buf);
> > > > +            return 0;
> > > > +        }
> > > > +
> > > > +        qemu_put_buffer(f, tag_buf, size);
> > > > +
> > > > +        g_free(tag_buf);
> > > > +
> > > > +        return size;
> > > > +    }
> > > > +
> > > > +    return 0;
> > > > +}
> > > > +
> > > > +static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
> > > > +{
> > > > +    uint8_t *tag_buf = NULL;
> > > > +    uint64_t ipa;
> > > > +    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
> > > > +
> > > > +    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
> > > > +        /* Buffer for the page tags(one byte per tag) */
> > > > +        tag_buf = g_try_malloc0(size);
> > > > +        if (!tag_buf) {
> > > > +            error_report("%s: Error allocating MTE tag_buf", __func__);
> > > > +            return;
> > > > +        }
> > > > +
> > > > +        qemu_get_buffer(f, tag_buf, size);
> > > > +        if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
> > >
> > > what protections are there here to stop you setting the mte on something
> > > useful, like part of the host kernel or qemu?
> >
> > Kernel would do some parameter check (e.g. IPA within a valid slot)
> > before setting the tag data.
> >
> > >
> > > > +            error_report("%s: Can't set MTE tags to guest", __func__);
> > > > +        }
> > > > +
> > > > +        g_free(tag_buf);
> > > > +    }
> > > > +
> > > > +    return;
> > > > +}
> > > > +
> > > >  /*
> > > >   * directly send the page to the stream
> > > >   *
> > > > @@ -1148,6 +1219,10 @@ static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> > > >  static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> > > >                              uint8_t *buf, bool async)
> > > >  {
> > > > +    if (rs->metadata_enabled) {
> > > > +        offset |= RAM_SAVE_FLAG_MTE;
> > > > +    }
> > > > +
> > > >      ram_counters.transferred += save_page_header(rs, rs->f, block,
> > > >                                                   offset | RAM_SAVE_FLAG_PAGE);
> > > >      if (async) {
> > > > @@ -1159,6 +1234,11 @@ static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
> > > >      }
> > > >      ram_counters.transferred += TARGET_PAGE_SIZE;
> > > >      ram_counters.normal++;
> > > > +
> > > > +    if (rs->metadata_enabled) {
> > > > +        ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf);
> > > > +    }
> > > > +
> > > >      return 1;
> > > >  }
> > > >
> > > > @@ -2189,6 +2269,7 @@ static void ram_state_reset(RAMState *rs)
> > > >      rs->last_version = ram_list.version;
> > > >      rs->ram_bulk_stage = true;
> > > >      rs->fpo_enabled = false;
> > > > +    rs->metadata_enabled = false;
> > > >  }
> > > >
> > > >  #define MAX_WAIT 50 /* ms, half buffered_file limit */
> > > > @@ -3779,7 +3860,7 @@ static int ram_load_precopy(QEMUFile *f)
> > > >              trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
> > > >          }
> > > >
> > > > -        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
> > > > +        switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) {
> > > >          case RAM_SAVE_FLAG_MEM_SIZE:
> > > >              /* Synchronize RAM block list */
> > > >              total_ram_bytes = addr;
> > > > @@ -3849,6 +3930,9 @@ static int ram_load_precopy(QEMUFile *f)
> > > >
> > > >          case RAM_SAVE_FLAG_PAGE:
> > > >              qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
> > > > +            if (flags & RAM_SAVE_FLAG_MTE) {
> > > > +                load_normal_page_mte_tags(f, host);
> > > > +            }
> > > >              break;
> > > >
> > > >          case RAM_SAVE_FLAG_COMPRESS_PAGE:
> > > > --
> > > > 2.17.1
> > > >
> > > --
> > > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > >
> >
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
diff mbox series

Patch

diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 921416f918..8b28cde8bf 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -166,6 +166,8 @@  struct VirtMachineState {
     PCIBus *bus;
     char *oem_id;
     char *oem_table_id;
+    /* migrate memory tags */
+    NotifierWithReturn precopy_notifier;
 };
 
 #define VIRT_ECAM_ID(high) (high ? VIRT_HIGH_PCIE_ECAM : VIRT_PCIE_ECAM)
diff --git a/include/migration/misc.h b/include/migration/misc.h
index bccc1b6b44..005133f471 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -38,6 +38,7 @@  void precopy_add_notifier(NotifierWithReturn *n);
 void precopy_remove_notifier(NotifierWithReturn *n);
 int precopy_notify(PrecopyNotifyReason reason, Error **errp);
 void precopy_enable_free_page_optimization(void);
+void precopy_enable_metadata_migration(void);
 
 void ram_mig_init(void);
 void qemu_guest_free_page_hint(void *addr, size_t len);
diff --git a/migration/ram.c b/migration/ram.c
index 72143da0ac..e67b798c3b 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -53,10 +53,12 @@ 
 #include "block.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/cpu-throttle.h"
+#include "sysemu/kvm.h"
 #include "savevm.h"
 #include "qemu/iov.h"
 #include "multifd.h"
 #include "sysemu/runstate.h"
+#include "kvm_arm.h"
 
 #if defined(__linux__)
 #include "qemu/userfaultfd.h"
@@ -80,6 +82,9 @@ 
 #define RAM_SAVE_FLAG_XBZRLE   0x40
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
+#define RAM_SAVE_FLAG_MTE              0x200
+
+#define MTE_GRANULE_SIZE   (16)
 
 static inline bool is_zero_range(uint8_t *p, uint64_t size)
 {
@@ -317,6 +322,8 @@  struct RAMState {
     bool ram_bulk_stage;
     /* The free page optimization is enabled */
     bool fpo_enabled;
+    /* The RAM meta data(e.t memory tag) is enabled */
+    bool metadata_enabled;
     /* How many times we have dirty too many pages */
     int dirty_rate_high_cnt;
     /* these variables are used for bitmap sync */
@@ -394,6 +401,15 @@  void precopy_enable_free_page_optimization(void)
     ram_state->fpo_enabled = true;
 }
 
+void precopy_enable_metadata_migration(void)
+{
+    if (!ram_state) {
+        return;
+    }
+
+    ram_state->metadata_enabled = true;
+}
+
 uint64_t ram_bytes_remaining(void)
 {
     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
@@ -1134,6 +1150,61 @@  static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
     return true;
 }
 
+static int save_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
+{
+    uint8_t *tag_buf = NULL;
+    uint64_t ipa;
+    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
+
+    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
+        /* Buffer for the page tags(one byte per tag) */
+        tag_buf = g_try_malloc0(size);
+        if (!tag_buf) {
+            error_report("%s: Error allocating MTE tag_buf", __func__);
+            return 0;
+        }
+
+        if (kvm_arm_mte_get_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
+            error_report("%s: Can't get MTE tags from guest", __func__);
+            g_free(tag_buf);
+            return 0;
+        }
+
+        qemu_put_buffer(f, tag_buf, size);
+
+        g_free(tag_buf);
+
+        return size;
+    }
+
+    return 0;
+}
+
+static void load_normal_page_mte_tags(QEMUFile *f, uint8_t *addr)
+{
+    uint8_t *tag_buf = NULL;
+    uint64_t ipa;
+    int size = TARGET_PAGE_SIZE / MTE_GRANULE_SIZE;
+
+    if (kvm_physical_memory_addr_from_host(kvm_state, addr, &ipa)) {
+        /* Buffer for the page tags(one byte per tag) */
+        tag_buf = g_try_malloc0(size);
+        if (!tag_buf) {
+            error_report("%s: Error allocating MTE tag_buf", __func__);
+            return;
+        }
+
+        qemu_get_buffer(f, tag_buf, size);
+        if (kvm_arm_mte_set_tags(ipa, TARGET_PAGE_SIZE, tag_buf) < 0) {
+            error_report("%s: Can't set MTE tags to guest", __func__);
+        }
+
+        g_free(tag_buf);
+    }
+
+    return;
+}
+
 /*
  * directly send the page to the stream
  *
@@ -1148,6 +1219,10 @@  static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                             uint8_t *buf, bool async)
 {
+    if (rs->metadata_enabled) {
+        offset |= RAM_SAVE_FLAG_MTE;
+    }
+
     ram_counters.transferred += save_page_header(rs, rs->f, block,
                                                  offset | RAM_SAVE_FLAG_PAGE);
     if (async) {
@@ -1159,6 +1234,11 @@  static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
     }
     ram_counters.transferred += TARGET_PAGE_SIZE;
     ram_counters.normal++;
+
+    if (rs->metadata_enabled) {
+        ram_counters.transferred += save_normal_page_mte_tags(rs->f, buf);
+    }
+
     return 1;
 }
 
@@ -2189,6 +2269,7 @@  static void ram_state_reset(RAMState *rs)
     rs->last_version = ram_list.version;
     rs->ram_bulk_stage = true;
     rs->fpo_enabled = false;
+    rs->metadata_enabled = false;
 }
 
 #define MAX_WAIT 50 /* ms, half buffered_file limit */
@@ -3779,7 +3860,7 @@  static int ram_load_precopy(QEMUFile *f)
             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
         }
 
-        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
+        switch (flags & ~(RAM_SAVE_FLAG_CONTINUE | RAM_SAVE_FLAG_MTE)) {
         case RAM_SAVE_FLAG_MEM_SIZE:
             /* Synchronize RAM block list */
             total_ram_bytes = addr;
@@ -3849,6 +3930,9 @@  static int ram_load_precopy(QEMUFile *f)
 
         case RAM_SAVE_FLAG_PAGE:
             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
+            if (flags & RAM_SAVE_FLAG_MTE) {
+                load_normal_page_mte_tags(f, host);
+            }
             break;
 
         case RAM_SAVE_FLAG_COMPRESS_PAGE: