diff mbox

[v8,33/54] postcopy: Incoming initialisation

Message ID 1443515898-3594-34-git-send-email-dgilbert@redhat.com
State New
Headers show

Commit Message

Dr. David Alan Gilbert Sept. 29, 2015, 8:37 a.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Reviewed-by: Amit Shah <amit.shah@redhat.com>
---
 include/migration/migration.h    |   3 ++
 include/migration/postcopy-ram.h |  12 +++++
 migration/postcopy-ram.c         | 102 +++++++++++++++++++++++++++++++++++++++
 migration/ram.c                  |  11 +++++
 migration/savevm.c               |   6 +++
 trace-events                     |   2 +
 6 files changed, 136 insertions(+)

Comments

Juan Quintela Oct. 21, 2015, 8:35 a.m. UTC | #1
"Dr. David Alan Gilbert (git)" <dgilbert@redhat.com> wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> Reviewed-by: Amit Shah <amit.shah@redhat.com>

> +/*
> + * At the end of migration, undo the effects of init_range
> + * opaque should be the MIS.
> + */
> +static int cleanup_range(const char *block_name, void *host_addr,
> +                        ram_addr_t offset, ram_addr_t length, void *opaque)
> +{
> +    MigrationIncomingState *mis = opaque;
> +    struct uffdio_range range_struct;
> +    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
> +
> +    /*
> +     * We turned off hugepage for the precopy stage with postcopy enabled
> +     * we can turn it back on now.
> +     */
> +#ifdef MADV_HUGEPAGE
> +    if (madvise(host_addr, length, MADV_HUGEPAGE)) {
> +        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
> +        return -1;
> +    }
> +#endif

this should be the same than:

       qemu_madvise(host_addr, lenght, QEMU_MADV_HUGEPAGE);

Only problem I can see, is that there is no way to differentiate that
madvise() has given one error or that MADV_HUGEPAGE is not defined.

If we really want that:

   if (QEMU_MADV_HUGEPAGE != QEM_MADV_INVALID) {
      if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE)) {
        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
        return -1;
   }

But I am not sure if we want it.


> +
> +    /*
> +     * We can also turn off userfault now since we should have all the
> +     * pages.   It can be useful to leave it on to debug postcopy
> +     * if you're not sure it's always getting every page.
> +     */
> +    range_struct.start = (uintptr_t)host_addr;
> +    range_struct.len = length;
> +
> +    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
> +        error_report("%s: userfault unregister %s", __func__, strerror(errno));
> +
> +        return -1;
> +    }
> +
> +    return 0;
> +}


I still think that exposing the userfault API all around is a bad idea,
that it would be easier to just export:

qemu_userfault_register_range(addr, lenght);
qemu_userfault_unregister_range(addr, lenght);

And hide the details on a header file.

Later, Juan.
Dr. David Alan Gilbert Nov. 3, 2015, 5:59 p.m. UTC | #2
* Juan Quintela (quintela@redhat.com) wrote:
> "Dr. David Alan Gilbert (git)" <dgilbert@redhat.com> wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> >
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> > Reviewed-by: Amit Shah <amit.shah@redhat.com>
> 
> > +/*
> > + * At the end of migration, undo the effects of init_range
> > + * opaque should be the MIS.
> > + */
> > +static int cleanup_range(const char *block_name, void *host_addr,
> > +                        ram_addr_t offset, ram_addr_t length, void *opaque)
> > +{
> > +    MigrationIncomingState *mis = opaque;
> > +    struct uffdio_range range_struct;
> > +    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
> > +
> > +    /*
> > +     * We turned off hugepage for the precopy stage with postcopy enabled
> > +     * we can turn it back on now.
> > +     */
> > +#ifdef MADV_HUGEPAGE
> > +    if (madvise(host_addr, length, MADV_HUGEPAGE)) {
> > +        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
> > +        return -1;
> > +    }
> > +#endif
> 
> this should be the same than:
> 
>        qemu_madvise(host_addr, lenght, QEMU_MADV_HUGEPAGE);
> 
> Only problem I can see, is that there is no way to differentiate that
> madvise() has given one error or that MADV_HUGEPAGE is not defined.
> 
> If we really want that:
> 
>    if (QEMU_MADV_HUGEPAGE != QEM_MADV_INVALID) {
>       if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE)) {
>         error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
>         return -1;
>    }
> 
> But I am not sure if we want it.

Yes, so what I've currently got is:

    if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE)) {
        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
        return -1;
    }

I'm tempted to add that if check, but the other similar case
is where you have headers that define HUGEPAGE, but a kernel built without
it, and in that case the madvise fails, which is a shame, since if the
kernel hasn't actually got transparent hugepages, then we don't
care if it failed to turn them on/off - but there doesn't seem
to be a good way to tell that.

> > +    /*
> > +     * We can also turn off userfault now since we should have all the
> > +     * pages.   It can be useful to leave it on to debug postcopy
> > +     * if you're not sure it's always getting every page.
> > +     */
> > +    range_struct.start = (uintptr_t)host_addr;
> > +    range_struct.len = length;
> > +
> > +    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
> > +        error_report("%s: userfault unregister %s", __func__, strerror(errno));
> > +
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +}
> 
> 
> I still think that exposing the userfault API all around is a bad idea,
> that it would be easier to just export:
> 
> qemu_userfault_register_range(addr, lenght);
> qemu_userfault_unregister_range(addr, lenght);
> 
> And hide the details on a header file.

That only hides a tiny bit of the detail;
for example the ioctl's for UFFDIO_COPY and UFFDIO_ZEROPAGE, have semantics
associated with them (that they also wake the waiting process for example)
it's not obvious that another OS would implement it in a similar way
or what the constraints on it would be.  Indeed the previous kernel API
we had, meant I had to do a lot more work with a similar set of calls
in userspace.  Most of the places where we pull this out into separate
headers/libraries is where we have an interface that's the same across
a bunch of different OSs but the detail is different.   Currently we only
have one interaface and no idea what the commonality would be, or how
much of the semantics that's in postcopy-ram.c would need to move with
that interface as well.

Dave

> 
> Later, Juan.
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Juan Quintela Nov. 3, 2015, 6:32 p.m. UTC | #3
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:
> * Juan Quintela (quintela@redhat.com) wrote:
>> "Dr. David Alan Gilbert (git)" <dgilbert@redhat.com> wrote:
>> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
>> >
>> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>> > Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
>> > Reviewed-by: Amit Shah <amit.shah@redhat.com>
>> 
>> > +/*
>> > + * At the end of migration, undo the effects of init_range
>> > + * opaque should be the MIS.
>> > + */
>> > +static int cleanup_range(const char *block_name, void *host_addr,
>> > +                        ram_addr_t offset, ram_addr_t length, void *opaque)
>> > +{
>> > +    MigrationIncomingState *mis = opaque;
>> > +    struct uffdio_range range_struct;
>> > +    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
>> > +
>> > +    /*
>> > +     * We turned off hugepage for the precopy stage with postcopy enabled
>> > +     * we can turn it back on now.
>> > +     */
>> > +#ifdef MADV_HUGEPAGE
>> > +    if (madvise(host_addr, length, MADV_HUGEPAGE)) {
>> > +        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
>> > +        return -1;
>> > +    }
>> > +#endif
>> 
>> this should be the same than:
>> 
>>        qemu_madvise(host_addr, lenght, QEMU_MADV_HUGEPAGE);
>> 
>> Only problem I can see, is that there is no way to differentiate that
>> madvise() has given one error or that MADV_HUGEPAGE is not defined.
>> 
>> If we really want that:
>> 
>>    if (QEMU_MADV_HUGEPAGE != QEM_MADV_INVALID) {
>>       if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE)) {
>>         error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
>>         return -1;
>>    }
>> 
>> But I am not sure if we want it.
>
> Yes, so what I've currently got is:
>
>     if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE)) {
>         error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
>         return -1;
>     }
>
> I'm tempted to add that if check, but the other similar case
> is where you have headers that define HUGEPAGE, but a kernel built without
> it, and in that case the madvise fails, which is a shame, since if the
> kernel hasn't actually got transparent hugepages, then we don't
> care if it failed to turn them on/off - but there doesn't seem
> to be a good way to tell that.
>
>> > +    /*
>> > +     * We can also turn off userfault now since we should have all the
>> > +     * pages.   It can be useful to leave it on to debug postcopy
>> > +     * if you're not sure it's always getting every page.
>> > +     */
>> > +    range_struct.start = (uintptr_t)host_addr;
>> > +    range_struct.len = length;
>> > +
>> > +    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
>> > + error_report("%s: userfault unregister %s", __func__,
>> > strerror(errno));
>> > +
>> > +        return -1;
>> > +    }
>> > +
>> > +    return 0;
>> > +}
>> 
>> 
>> I still think that exposing the userfault API all around is a bad idea,
>> that it would be easier to just export:
>> 
>> qemu_userfault_register_range(addr, lenght);
>> qemu_userfault_unregister_range(addr, lenght);
>> 
>> And hide the details on a header file.
>
> That only hides a tiny bit of the detail;
> for example the ioctl's for UFFDIO_COPY and UFFDIO_ZEROPAGE, have semantics
> associated with them (that they also wake the waiting process for example)
> it's not obvious that another OS would implement it in a similar way
> or what the constraints on it would be.  Indeed the previous kernel API
> we had, meant I had to do a lot more work with a similar set of calls
> in userspace.  Most of the places where we pull this out into separate
> headers/libraries is where we have an interface that's the same across
> a bunch of different OSs but the detail is different.   Currently we only
> have one interaface and no idea what the commonality would be, or how
> much of the semantics that's in postcopy-ram.c would need to move with
> that interface as well.

ok, if it is too difficult (I didn't knew about the associated
semantics), we will wait until something else implemented a similar
interface.

Thanks, Juan.
diff mbox

Patch

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 4904d00..321ad1e 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -86,6 +86,8 @@  struct MigrationIncomingState {
      */
     QemuEvent main_thread_load_event;
 
+    /* For the kernel to send us notifications */
+    int       userfault_fd;
     QEMUFile *to_src_file;
     QemuMutex rp_mutex;    /* We send replies from multiple threads */
 
@@ -211,6 +213,7 @@  int ram_postcopy_send_discard_bitmap(MigrationState *ms);
 /* For incoming postcopy discard */
 int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
                       uint64_t start, size_t length);
+int ram_postcopy_incoming_init(MigrationIncomingState *mis);
 
 /**
  * @migrate_add_blocker - prevent migration from proceeding
diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
index 80ed2d9..9d98f7a 100644
--- a/include/migration/postcopy-ram.h
+++ b/include/migration/postcopy-ram.h
@@ -17,6 +17,18 @@ 
 bool postcopy_ram_supported_by_host(void);
 
 /*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from ram.c's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages);
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis);
+
+/*
  * Discard the contents of 'length' bytes from 'start'
  * We can assume that if we've been called postcopy_ram_hosttest returned true
  */
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 10c9cab..15ac820 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -184,6 +184,97 @@  int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
     return 0;
 }
 
+/*
+ * Setup an area of RAM so that it *can* be used for postcopy later; this
+ * must be done right at the start prior to pre-copy.
+ * opaque should be the MIS.
+ */
+static int init_range(const char *block_name, void *host_addr,
+                      ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+
+    trace_postcopy_init_range(block_name, host_addr, offset, length);
+
+    /*
+     * We need the whole of RAM to be truly empty for postcopy, so things
+     * like ROMs and any data tables built during init must be zero'd
+     * - we're going to get the copy from the source anyway.
+     * (Precopy will just overwrite this data, so doesn't need the discard)
+     */
+    if (postcopy_ram_discard_range(mis, host_addr, length)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * At the end of migration, undo the effects of init_range
+ * opaque should be the MIS.
+ */
+static int cleanup_range(const char *block_name, void *host_addr,
+                        ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    struct uffdio_range range_struct;
+    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
+
+    /*
+     * We turned off hugepage for the precopy stage with postcopy enabled
+     * we can turn it back on now.
+     */
+#ifdef MADV_HUGEPAGE
+    if (madvise(host_addr, length, MADV_HUGEPAGE)) {
+        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
+        return -1;
+    }
+#endif
+
+    /*
+     * We can also turn off userfault now since we should have all the
+     * pages.   It can be useful to leave it on to debug postcopy
+     * if you're not sure it's always getting every page.
+     */
+    range_struct.start = (uintptr_t)host_addr;
+    range_struct.len = length;
+
+    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
+        error_report("%s: userfault unregister %s", __func__, strerror(errno));
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from arch_init's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    if (qemu_ram_foreach_block(init_range, mis)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+    /* TODO: Join the fault thread once we're sure it will exit */
+    if (qemu_ram_foreach_block(cleanup_range, mis)) {
+        return -1;
+    }
+
+    return 0;
+}
+
 #else
 /* No target OS support, stubs just fail */
 bool postcopy_ram_supported_by_host(void)
@@ -192,6 +283,17 @@  bool postcopy_ram_supported_by_host(void)
     return false;
 }
 
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    error_report("postcopy_ram_incoming_init: No OS support");
+    return -1;
+}
+
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+    assert(0);
+}
+
 int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
                                size_t length)
 {
diff --git a/migration/ram.c b/migration/ram.c
index e1c6c4a..d005aca 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1762,6 +1762,17 @@  static void decompress_data_with_multi_threads(uint8_t *compbuf,
     }
 }
 
+/*
+ * Allocate data structures etc needed by incoming migration with postcopy-ram
+ * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
+ */
+int ram_postcopy_incoming_init(MigrationIncomingState *mis)
+{
+    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    return postcopy_ram_incoming_init(mis, ram_pages);
+}
+
 static int ram_load(QEMUFile *f, void *opaque, int version_id)
 {
     int flags = 0, ret = 0;
diff --git a/migration/savevm.c b/migration/savevm.c
index 85462b1..11bf172 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1279,6 +1279,12 @@  static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
         return -1;
     }
 
+    if (ram_postcopy_incoming_init(mis)) {
+        return -1;
+    }
+
+    postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+
     return 0;
 }
 
diff --git a/trace-events b/trace-events
index aa2d2e7..aa65d3d 100644
--- a/trace-events
+++ b/trace-events
@@ -1522,6 +1522,8 @@  rdma_start_outgoing_migration_after_rdma_source_init(void) ""
 # migration/postcopy-ram.c
 postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
 postcopy_ram_discard_range(void *start, size_t length) "%p,+%zx"
+postcopy_cleanup_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_init_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
 
 # kvm-all.c
 kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"