diff mbox

[v4,31/47] postcopy: Incoming initialisation

Message ID 1412358473-31398-32-git-send-email-dgilbert@redhat.com
State New
Headers show

Commit Message

Dr. David Alan Gilbert Oct. 3, 2014, 5:47 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 arch_init.c                      |  11 ++++
 include/migration/migration.h    |   1 +
 include/migration/postcopy-ram.h |  12 +++++
 migration.c                      |   1 +
 postcopy-ram.c                   | 110 ++++++++++++++++++++++++++++++++++++++-
 savevm.c                         |   4 ++
 6 files changed, 138 insertions(+), 1 deletion(-)

Comments

David Gibson Nov. 5, 2014, 6:47 a.m. UTC | #1
On Fri, Oct 03, 2014 at 06:47:37PM +0100, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  arch_init.c                      |  11 ++++
>  include/migration/migration.h    |   1 +
>  include/migration/postcopy-ram.h |  12 +++++
>  migration.c                      |   1 +
>  postcopy-ram.c                   | 110 ++++++++++++++++++++++++++++++++++++++-
>  savevm.c                         |   4 ++
>  6 files changed, 138 insertions(+), 1 deletion(-)
> 
> diff --git a/arch_init.c b/arch_init.c
> index 030d189..4a03171 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -1345,6 +1345,17 @@ void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
>      }
>  }
>  
> +/*
> + * Allocate data structures etc needed by incoming migration with postcopy-ram
> + * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
> + */
> +int ram_postcopy_incoming_init(MigrationIncomingState *mis)
> +{
> +    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
> +
> +    return postcopy_ram_incoming_init(mis, ram_pages);
> +}

Um.. yeah.  I'm sure ram_postcopy_incoming_init versus
postcopy_ram_incoming_init won't get confusing o_O.

[snip]
> +/*
> + * Setup an area of RAM so that it *can* be used for postcopy later; this
> + * must be done right at the start prior to pre-copy.
> + * opaque should be the MIS.
> + */
> +static int init_area(const char *block_name, void *host_addr,
> +                     ram_addr_t offset, ram_addr_t length, void *opaque)
> +{
> +    MigrationIncomingState *mis = opaque;
> +
> +    DPRINTF("init_area: %s: %p offset=%zx length=%zd(%zx)",
> +            block_name, host_addr, offset, length, length);
> +    /*
> +     * We need the whole of RAM to be truly empty for postcopy, so things
> +     * like ROMs and any data tables built during init must be zero'd
> +     * - we're going to get the copy from the source anyway.
> +     */
> +    if (postcopy_ram_discard_range(mis, host_addr, (host_addr + length - 1))) {
> +        return -1;
> +    }
> +
> +    /*
> +     * We also need the area to be normal 4k pages, not huge pages
> +     * (otherwise we can't be sure we can use remap_anon_pages to put
> +     * a 4k page in later).  THP might come along and map a 2MB page
> +     * and when it's partially accessed in precopy it might not break
> +     * it down, but leave a 2MB zero'd page.
> +     */
> +    if (madvise(host_addr, length, MADV_NOHUGEPAGE)) {
> +        perror("init_area: NOHUGEPAGE");
> +        return -1;
> +    }

I'm assuming this is because remap_anon_pages() can't automatically
split a THP itself.  It's not immediately obvious to me why it can't
though.

Also.. what effect will this have on an actual hugetlbfs memory
region?  If there's code to handle that case I haven't spotted it yet.

> +
> +    return 0;
> +}
> +
> +/*
> + * At the end of migration, undo the effects of init_area
> + * opaque should be the MIS.
> + */
> +static int cleanup_area(const char *block_name, void *host_addr,
> +                        ram_addr_t offset, ram_addr_t length, void *opaque)
> +{
> +    /* Turn off userfault here as well? */

This comment appears to be obsoleted by the code below.

> +
> +    DPRINTF("cleanup_area: %s: %p offset=%zx length=%zd(%zx)",
> +            block_name, host_addr, offset, length, length);
> +    /*
> +     * We turned off hugepage for the precopy stage with postcopy enabled
> +     * we can turn it back on now.
> +     */
> +    if (madvise(host_addr, length, MADV_HUGEPAGE)) {
> +        perror("init_area: HUGEPAGE");
> +        return -1;
> +    }
> +
> +    /*
> +     * We can also turn off userfault now since we should have all the
> +     * pages.   It can be useful to leave it on to debug postcopy
> +     * if you're not sure it's always getting every page.
> +     */
> +    if (madvise(host_addr, length, MADV_NOUSERFAULT)) {
> +        perror("init_area: NOUSERFAULT");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Initialise postcopy-ram, setting the RAM to a state where we can go into
> + * postcopy later; must be called prior to any precopy.
> + * called from arch_init's similarly named ram_postcopy_incoming_init
> + */
> +int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
> +{
> +    postcopy_pmi_init(mis, ram_pages);
> +
> +    if (qemu_ram_foreach_block(init_area, mis)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * At the end of a migration where postcopy_ram_incoming_init was called.
> + */
> +int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
> +{
> +    /* TODO: Join the fault thread once we're sure it will exit */
> +    if (qemu_ram_foreach_block(cleanup_area, mis)) {
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>  #else
>  /* No target OS support, stubs just fail */
>  
> @@ -404,6 +501,17 @@ void postcopy_hook_early_receive(MigrationIncomingState *mis,
>      /* We don't support postcopy so don't care */
>  }
>  
> +int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
> +{
> +    error_report("postcopy_ram_incoming_init: No OS support");
> +    return -1;
> +}
> +
> +int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
> +{
> +    assert(0);
> +}
> +
>  void postcopy_pmi_destroy(MigrationIncomingState *mis)
>  {
>      /* Called in normal cleanup path - so it's OK */
> diff --git a/savevm.c b/savevm.c
> index 7f9e0b2..54bdb26 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -1166,6 +1166,10 @@ static int loadvm_postcopy_ram_handle_advise(MigrationIncomingState *mis,
>          return -1;
>      }
>  
> +    if (ram_postcopy_incoming_init(mis)) {
> +        return -1;
> +    }
> +
>      mis->postcopy_ram_state = POSTCOPY_RAM_INCOMING_ADVISE;
>  
>      /*
Dr. David Alan Gilbert Dec. 17, 2014, 5:21 p.m. UTC | #2
* David Gibson (david@gibson.dropbear.id.au) wrote:
> On Fri, Oct 03, 2014 at 06:47:37PM +0100, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  arch_init.c                      |  11 ++++
> >  include/migration/migration.h    |   1 +
> >  include/migration/postcopy-ram.h |  12 +++++
> >  migration.c                      |   1 +
> >  postcopy-ram.c                   | 110 ++++++++++++++++++++++++++++++++++++++-
> >  savevm.c                         |   4 ++
> >  6 files changed, 138 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch_init.c b/arch_init.c
> > index 030d189..4a03171 100644
> > --- a/arch_init.c
> > +++ b/arch_init.c
> > @@ -1345,6 +1345,17 @@ void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
> >      }
> >  }
> >  
> > +/*
> > + * Allocate data structures etc needed by incoming migration with postcopy-ram
> > + * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
> > + */
> > +int ram_postcopy_incoming_init(MigrationIncomingState *mis)
> > +{
> > +    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
> > +
> > +    return postcopy_ram_incoming_init(mis, ram_pages);
> > +}
> 
> Um.. yeah.  I'm sure ram_postcopy_incoming_init versus
> postcopy_ram_incoming_init won't get confusing o_O.

agreed; that's why I put the comments on.  My problem here is that:
  1) last_ram_offset() comes from code that's poisoned so it can't be built in
     a target independent file
  2) I'd managed so far (with a couple of hacks) to keep postcopy_ram.c
     target independent.
  3) ram_ is the prefix for external names in arch_init.c
  4) postcopy_ram_ is the prefix for external names in postcopy_ram.c

If I threw in the towel and made postcopy_ram target dependent it would
remove the need for that wrapper; it might be the best bet.
(Other naming suggestions also welcome)

> [snip]
> > +/*
> > + * Setup an area of RAM so that it *can* be used for postcopy later; this
> > + * must be done right at the start prior to pre-copy.
> > + * opaque should be the MIS.
> > + */
> > +static int init_area(const char *block_name, void *host_addr,
> > +                     ram_addr_t offset, ram_addr_t length, void *opaque)
> > +{
> > +    MigrationIncomingState *mis = opaque;
> > +
> > +    DPRINTF("init_area: %s: %p offset=%zx length=%zd(%zx)",
> > +            block_name, host_addr, offset, length, length);
> > +    /*
> > +     * We need the whole of RAM to be truly empty for postcopy, so things
> > +     * like ROMs and any data tables built during init must be zero'd
> > +     * - we're going to get the copy from the source anyway.
> > +     */
> > +    if (postcopy_ram_discard_range(mis, host_addr, (host_addr + length - 1))) {
> > +        return -1;
> > +    }
> > +
> > +    /*
> > +     * We also need the area to be normal 4k pages, not huge pages
> > +     * (otherwise we can't be sure we can use remap_anon_pages to put
> > +     * a 4k page in later).  THP might come along and map a 2MB page
> > +     * and when it's partially accessed in precopy it might not break
> > +     * it down, but leave a 2MB zero'd page.
> > +     */
> > +    if (madvise(host_addr, length, MADV_NOHUGEPAGE)) {
> > +        perror("init_area: NOHUGEPAGE");
> > +        return -1;
> > +    }
> 
> I'm assuming this is because remap_anon_pages() can't automatically
> split a THP itself.  It's not immediately obvious to me why it can't
> though.

No, I think this restriction stems from two things:
   1) remap_anon_pages not allowing us to map into an area that's already
   got a page present - it's a good protection mechanism against us
   doing something stupid and receiving a page that we already received
   and the destination is busy accessing.

   2) We wouldn't want THP to decide to convert a page that we'd only
   partially received into a HP because we wouldn't then receive userfault
   messages for it.

(Although it might be best to check with Andrea).
(1) might disappear with the modifications to replace remap_anon_pages
that Andrea is working on.

> Also.. what effect will this have on an actual hugetlbfs memory
> region?  If there's code to handle that case I haven't spotted it yet.

I wouldn't expect this code to work with hugetlbfs mappings.

> > +
> > +    return 0;
> > +}
> > +
> > +/*
> > + * At the end of migration, undo the effects of init_area
> > + * opaque should be the MIS.
> > + */
> > +static int cleanup_area(const char *block_name, void *host_addr,
> > +                        ram_addr_t offset, ram_addr_t length, void *opaque)
> > +{
> > +    /* Turn off userfault here as well? */
> 
> This comment appears to be obsoleted by the code below.

Thanks; I've squashed it.

Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox

Patch

diff --git a/arch_init.c b/arch_init.c
index 030d189..4a03171 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -1345,6 +1345,17 @@  void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
     }
 }
 
+/*
+ * Allocate data structures etc needed by incoming migration with postcopy-ram
+ * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
+ */
+int ram_postcopy_incoming_init(MigrationIncomingState *mis)
+{
+    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    return postcopy_ram_incoming_init(mis, ram_pages);
+}
+
 static int ram_load(QEMUFile *f, void *opaque, int version_id)
 {
     ram_addr_t addr;
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 73d338e..be63c89 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -203,6 +203,7 @@  int ram_postcopy_send_discard_bitmap(MigrationState *ms);
 /* For incoming postcopy discard */
 int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
                       uint64_t start, uint64_t end);
+int ram_postcopy_incoming_init(MigrationIncomingState *mis);
 
 /**
  * @migrate_add_blocker - prevent migration from proceeding
diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
index 2a39a03..8f237a2 100644
--- a/include/migration/postcopy-ram.h
+++ b/include/migration/postcopy-ram.h
@@ -19,6 +19,18 @@ 
 int postcopy_ram_hosttest(void);
 
 /*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from arch_init's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages);
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis);
+
+/*
  * In 'advise' mode record that a page has been received.
  */
 void postcopy_hook_early_receive(MigrationIncomingState *mis,
diff --git a/migration.c b/migration.c
index db860c9..63d70b6 100644
--- a/migration.c
+++ b/migration.c
@@ -99,6 +99,7 @@  MigrationIncomingState *migration_incoming_state_init(QEMUFile* f)
 
 void migration_incoming_state_destroy(void)
 {
+    postcopy_pmi_destroy(mis_current);
     g_free(mis_current);
     mis_current = NULL;
 }
diff --git a/postcopy-ram.c b/postcopy-ram.c
index 76f992f..8eccf26 100644
--- a/postcopy-ram.c
+++ b/postcopy-ram.c
@@ -104,7 +104,6 @@  struct PostcopyDiscardState {
 /* It's a pair of bitmaps (of the same structure as the migration bitmaps)*/
 /* holding one bit per target-page, although all operations work on host  */
 /* pages.                                                                 */
-__attribute__ (( unused )) /* Until later in patch series */
 static void postcopy_pmi_init(MigrationIncomingState *mis, size_t ram_pages)
 {
     unsigned int tpb = qemu_target_page_bits();
@@ -388,6 +387,104 @@  int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
     return 0;
 }
 
+/*
+ * Setup an area of RAM so that it *can* be used for postcopy later; this
+ * must be done right at the start prior to pre-copy.
+ * opaque should be the MIS.
+ */
+static int init_area(const char *block_name, void *host_addr,
+                     ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+
+    DPRINTF("init_area: %s: %p offset=%zx length=%zd(%zx)",
+            block_name, host_addr, offset, length, length);
+    /*
+     * We need the whole of RAM to be truly empty for postcopy, so things
+     * like ROMs and any data tables built during init must be zero'd
+     * - we're going to get the copy from the source anyway.
+     */
+    if (postcopy_ram_discard_range(mis, host_addr, (host_addr + length - 1))) {
+        return -1;
+    }
+
+    /*
+     * We also need the area to be normal 4k pages, not huge pages
+     * (otherwise we can't be sure we can use remap_anon_pages to put
+     * a 4k page in later).  THP might come along and map a 2MB page
+     * and when it's partially accessed in precopy it might not break
+     * it down, but leave a 2MB zero'd page.
+     */
+    if (madvise(host_addr, length, MADV_NOHUGEPAGE)) {
+        perror("init_area: NOHUGEPAGE");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * At the end of migration, undo the effects of init_area
+ * opaque should be the MIS.
+ */
+static int cleanup_area(const char *block_name, void *host_addr,
+                        ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    /* Turn off userfault here as well? */
+
+    DPRINTF("cleanup_area: %s: %p offset=%zx length=%zd(%zx)",
+            block_name, host_addr, offset, length, length);
+    /*
+     * We turned off hugepage for the precopy stage with postcopy enabled
+     * we can turn it back on now.
+     */
+    if (madvise(host_addr, length, MADV_HUGEPAGE)) {
+        perror("init_area: HUGEPAGE");
+        return -1;
+    }
+
+    /*
+     * We can also turn off userfault now since we should have all the
+     * pages.   It can be useful to leave it on to debug postcopy
+     * if you're not sure it's always getting every page.
+     */
+    if (madvise(host_addr, length, MADV_NOUSERFAULT)) {
+        perror("init_area: NOUSERFAULT");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from arch_init's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    postcopy_pmi_init(mis, ram_pages);
+
+    if (qemu_ram_foreach_block(init_area, mis)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+    /* TODO: Join the fault thread once we're sure it will exit */
+    if (qemu_ram_foreach_block(cleanup_area, mis)) {
+        return -1;
+    }
+
+    return 0;
+}
+
 #else
 /* No target OS support, stubs just fail */
 
@@ -404,6 +501,17 @@  void postcopy_hook_early_receive(MigrationIncomingState *mis,
     /* We don't support postcopy so don't care */
 }
 
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    error_report("postcopy_ram_incoming_init: No OS support");
+    return -1;
+}
+
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+    assert(0);
+}
+
 void postcopy_pmi_destroy(MigrationIncomingState *mis)
 {
     /* Called in normal cleanup path - so it's OK */
diff --git a/savevm.c b/savevm.c
index 7f9e0b2..54bdb26 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1166,6 +1166,10 @@  static int loadvm_postcopy_ram_handle_advise(MigrationIncomingState *mis,
         return -1;
     }
 
+    if (ram_postcopy_incoming_init(mis)) {
+        return -1;
+    }
+
     mis->postcopy_ram_state = POSTCOPY_RAM_INCOMING_ADVISE;
 
     /*