diff mbox

[v5,17/45] Add wrappers and handlers for sending/receiving the postcopy-ram migration messages.

Message ID 1424883128-9841-18-git-send-email-dgilbert@redhat.com
State New
Headers show

Commit Message

Dr. David Alan Gilbert Feb. 25, 2015, 4:51 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

The state of the postcopy process is managed via a series of messages;
   * Add wrappers and handlers for sending/receiving these messages
   * Add state variable that tracks the current state of postcopy

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/migration.h |  15 ++
 include/sysemu/sysemu.h       |  23 +++
 migration/migration.c         |  13 ++
 savevm.c                      | 325 ++++++++++++++++++++++++++++++++++++++++++
 trace-events                  |  11 ++
 5 files changed, 387 insertions(+)

Comments

David Gibson March 12, 2015, 9:30 a.m. UTC | #1
On Wed, Feb 25, 2015 at 04:51:40PM +0000, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> The state of the postcopy process is managed via a series of messages;
>    * Add wrappers and handlers for sending/receiving these messages
>    * Add state variable that track the current state of postcopy
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  include/migration/migration.h |  15 ++
>  include/sysemu/sysemu.h       |  23 +++
>  migration/migration.c         |  13 ++
>  savevm.c                      | 325 ++++++++++++++++++++++++++++++++++++++++++
>  trace-events                  |  11 ++
>  5 files changed, 387 insertions(+)
> 
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index f94af5b..81cd1f2 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -52,6 +52,14 @@ typedef struct MigrationState MigrationState;
>  
>  typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head;
>  
> +typedef enum {
> +    POSTCOPY_INCOMING_NONE = 0,  /* Initial state - no postcopy */
> +    POSTCOPY_INCOMING_ADVISE,
> +    POSTCOPY_INCOMING_LISTENING,
> +    POSTCOPY_INCOMING_RUNNING,
> +    POSTCOPY_INCOMING_END
> +} PostcopyState;
> +
>  /* State for the incoming migration */
>  struct MigrationIncomingState {
>      QEMUFile *file;
> @@ -59,6 +67,8 @@ struct MigrationIncomingState {
>      /* See savevm.c */
>      LoadStateEntry_Head loadvm_handlers;
>  
> +    PostcopyState postcopy_state;
> +
>      QEMUFile *return_path;
>      QemuMutex      rp_mutex;    /* We send replies from multiple threads */
>  };
> @@ -219,4 +229,9 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
>                               ram_addr_t offset, size_t size,
>                               int *bytes_sent);
>  
> +PostcopyState postcopy_state_get(MigrationIncomingState *mis);
> +
> +/* Set the state and return the old state */
> +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> +                                 PostcopyState new_state);
>  #endif
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 8da879f..d6a6d51 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -87,6 +87,18 @@ enum qemu_vm_cmd {
>      MIG_CMD_INVALID = 0,       /* Must be 0 */
>      MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
>      MIG_CMD_PING,              /* Request a PONG on the RP */
> +
> +    MIG_CMD_POSTCOPY_ADVISE = 20,  /* Prior to any page transfers, just
> +                                      warn we might want to do PC */
> +    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
> +                                      pages as it's running. */
> +    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
> +    MIG_CMD_POSTCOPY_END,          /* Postcopy is finished. */
> +
> +    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
> +                                      were previously sent during
> +                                      precopy but are dirty. */
> +
>  };
>  
>  bool qemu_savevm_state_blocked(Error **errp);
> @@ -101,6 +113,17 @@ void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
>                                uint16_t len, uint8_t *data);
>  void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
>  void qemu_savevm_send_open_return_path(QEMUFile *f);
> +void qemu_savevm_send_postcopy_advise(QEMUFile *f);
> +void qemu_savevm_send_postcopy_listen(QEMUFile *f);
> +void qemu_savevm_send_postcopy_run(QEMUFile *f);
> +void qemu_savevm_send_postcopy_end(QEMUFile *f, uint8_t status);
> +
> +void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
> +                                           uint16_t len, uint8_t offset,
> +                                           uint64_t *addrlist,
> +                                           uint32_t *masklist);
> +
> +
>  int qemu_loadvm_state(QEMUFile *f);
>  
>  /* SLIRP */
> diff --git a/migration/migration.c b/migration/migration.c
> index 434864a..957115a 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -971,3 +971,16 @@ void migrate_fd_connect(MigrationState *s)
>      qemu_thread_create(&s->thread, "migration", migration_thread, s,
>                         QEMU_THREAD_JOINABLE);
>  }
> +
> +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
> +{
> +    return atomic_fetch_add(&mis->postcopy_state, 0);
> +}
> +
> +/* Set the state and return the old state */
> +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> +                                 PostcopyState new_state)
> +{
> +    return atomic_xchg(&mis->postcopy_state, new_state);

Is there anything explaining what the overall atomicity requirements
are for this state variable?  It's a bit hard to tell if an atomic
xchg is necessary or sufficient without a description of what the
overall concurrency scheme is with regards to this variable.

> +}
> +
> diff --git a/savevm.c b/savevm.c
> index 4b619da..e31ccb0 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -39,6 +39,7 @@
>  #include "exec/memory.h"
>  #include "qmp-commands.h"
>  #include "trace.h"
> +#include "qemu/bitops.h"
>  #include "qemu/iov.h"
>  #include "block/snapshot.h"
>  #include "block/qapi.h"
> @@ -635,6 +636,90 @@ void qemu_savevm_send_open_return_path(QEMUFile *f)
>      qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
>  }
>  
> +/* Send prior to any postcopy transfer */
> +void qemu_savevm_send_postcopy_advise(QEMUFile *f)
> +{
> +    uint64_t tmp[2];
> +    tmp[0] = cpu_to_be64(getpagesize());
> +    tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits());

I wonder if using a structure for the tmp buffer might be an idea, as
a form of documentation of the data expected in the ADVISE command.

> +    trace_qemu_savevm_send_postcopy_advise();
> +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp);
> +}
> +
> +/* Prior to running, to cause pages that have been dirtied after precopy
> + * started to be discarded on the destination.
> + * CMD_POSTCOPY_RAM_DISCARD consist of:
> + *  3 byte header (filled in by qemu_savevm_send_postcopy_ram_discard)
> + *      byte   version (0)
> + *      byte   offset to be subtracted from each page address to deal with
> + *             RAMBlocks that don't start on a mask word boundary.

I think this needs more explanation.  Why can't this be folded into
the page addresses sent over the wire?

> + *      byte   Length of name field
> + *  n x byte   RAM block name (NOT 0 terminated)

I think \0 terminating would probably be safer, even if it's
technically redundant.

> + *  n x
> + *      be64   Page addresses for start of an invalidation range
> + *      be32   mask of 32 pages, '1' to discard'

Is the extra compactness from this semi-sparse bitmap encoding
actually worth it?  A simple list of page addresses, or address ranges
to discard would be substantially simpler to get one's head around,
and also seems like it might be more robust against future
implementation changes as a wire format.

> + *  Hopefully this is pretty sparse so we don't get too many entries,
> + *  and using the mask should deal with most pagesize differences
> + *  just ending up as a single full mask
> + *
> + * The mask is always 32bits irrespective of the long size
> + *
> + *  name:  RAMBlock name that these entries are part of
> + *  len: Number of page entries
> + *  addrlist: 'len' addresses
> + *  masklist: 'len' masks (corresponding to the addresses)
> + */
> +void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
> +                                           uint16_t len, uint8_t offset,
> +                                           uint64_t *addrlist,
> +                                           uint32_t *masklist)
> +{
> +    uint8_t *buf;
> +    uint16_t tmplen;
> +    uint16_t t;
> +
> +    trace_qemu_savevm_send_postcopy_ram_discard();
> +    buf = g_malloc0(len*12 + strlen(name) + 3);
> +    buf[0] = 0; /* Version */
> +    buf[1] = offset;
> +    assert(strlen(name) < 256);
> +    buf[2] = strlen(name);
> +    memcpy(buf+3, name, strlen(name));
> +    tmplen = 3+strlen(name);

Repeated calls to strlen() always seem icky to me, although I guess
it's all gcc builtins here, so they are probably optimized out by
CSE.

> +    for (t = 0; t < len; t++) {
> +        cpu_to_be64w((uint64_t *)(buf + tmplen), addrlist[t]);
> +        tmplen += 8;
> +        cpu_to_be32w((uint32_t *)(buf + tmplen), masklist[t]);
> +        tmplen += 4;
> +    }
> +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
> +    g_free(buf);
> +}
> +
> +/* Get the destination into a state where it can receive postcopy data. */
> +void qemu_savevm_send_postcopy_listen(QEMUFile *f)
> +{
> +    trace_savevm_send_postcopy_listen();
> +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
> +}
> +
> +/* Kick the destination into running */
> +void qemu_savevm_send_postcopy_run(QEMUFile *f)
> +{
> +    trace_savevm_send_postcopy_run();
> +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
> +}

DISCARD will typically immediately precede LISTEN, won't it?  Is there
a reason not to put the discard data into the LISTEN command?

> +
> +/* End of postcopy - with a status byte; 0 is good, anything else is a fail */
> +void qemu_savevm_send_postcopy_end(QEMUFile *f, uint8_t status)
> +{
> +    trace_savevm_send_postcopy_end();
> +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_END, 1, &status);
> +}

What's the distinction between the postcopy END command and the normal
end of the migration stream?  Is there already a way to detect the end
of stream normally?

>  bool qemu_savevm_state_blocked(Error **errp)
>  {
>      SaveStateEntry *se;
> @@ -961,6 +1046,212 @@ enum LoadVMExitCodes {
>  
>  static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
>  
> +/* ------ incoming postcopy messages ------ */
> +/* 'advise' arrives before any transfers just to tell us that a postcopy
> + * *might* happen - it might be skipped if precopy transferred everything
> + * quickly.
> + */
> +static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
> +                                         uint64_t remote_hps,
> +                                         uint64_t remote_tps)
> +{
> +    PostcopyState ps = postcopy_state_get(mis);
> +    trace_loadvm_postcopy_handle_advise();
> +    if (ps != POSTCOPY_INCOMING_NONE) {
> +        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
> +        return -1;
> +    }
> +
> +    if (remote_hps != getpagesize())  {
> +        /*
> +         * Some combinations of mismatch are probably possible but it gets
> +         * a bit more complicated.  In particular we need to place whole
> +         * host pages on the dest at once, and we need to ensure that we
> +         * handle dirtying to make sure we never end up sending part of
> +         * a hostpage on it's own.
> +         */
> +        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
> +                     (int)remote_hps, getpagesize());
> +        return -1;
> +    }
> +
> +    if (remote_tps != (1ul << qemu_target_page_bits())) {
> +        /*
> +         * Again, some differences could be dealt with, but for now keep it
> +         * simple.
> +         */
> +        error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
> +                     (int)remote_tps, 1 << qemu_target_page_bits());
> +        return -1;
> +    }
> +
> +    postcopy_state_set(mis, POSTCOPY_INCOMING_ADVISE);

Should you be checking the return value here to make sure it's still
POSTCOPY_INCOMING_NONE?  Atomic xchgs seem overkill if you still have
a race between the fetch at the top and the set here.

Or, in fact, should you be just doing an atomic exchange-then-check at
the top, rather than checking at the top, then changing at the bottom.

> +    return 0;
> +}
> +
> +/* After postcopy we will be told to throw some pages away since they're
> + * dirty and will have to be demand fetched.  Must happen before CPU is
> + * started.
> + * There can be 0..many of these messages, each encoding multiple pages.
> + * Bits set in the message represent a page in the source VMs bitmap, but
> + * since the guest/target page sizes can be different on s/d then we have
> + * to convert.

Uh.. I thought the checks in the ADVISE processing eliminated that possibility.

> + */
> +static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
> +                                              uint16_t len)
> +{
> +    int tmp;
> +    unsigned int first_bit_offset;
> +    char ramid[256];
> +    PostcopyState ps = postcopy_state_get(mis);
> +
> +    trace_loadvm_postcopy_ram_handle_discard();
> +
> +    if (ps != POSTCOPY_INCOMING_ADVISE) {

Could you theoretically also get these in LISTEN state?  I realise the
current implementation doesn't do that.

> +        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
> +                     ps);
> +        return -1;
> +    }
> +    /* We're expecting a
> +     *    3 byte header,
> +     *    a RAM ID string
> +     *    then at least 1 12 byte chunks
> +    */
> +    if (len < 16) {
> +        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
> +        return -1;
> +    }
> +
> +    tmp = qemu_get_byte(mis->file);
> +    if (tmp != 0) {
> +        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
> +        return -1;
> +    }
> +    first_bit_offset = qemu_get_byte(mis->file);
> +
> +    if (qemu_get_counted_string(mis->file, ramid)) {
> +        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
> +        return -1;
> +    }
> +
> +    len -= 3+strlen(ramid);
> +    if (len % 12) {
> +        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
> +        return -1;
> +    }
> +    while (len) {
> +        uint64_t startaddr;
> +        uint32_t mask;
> +        /*
> +         * We now have pairs of address, mask
> +         *   Each word of mask is 32 bits, where each bit corresponds to one
> +         *   target page.
> +         *   RAMBlocks don't necessarily start on word boundaries,
> +         *   and the offset in the header indicates the offset into the 1st
> +         *   mask word that corresponds to the 1st page of the RAMBlock.
> +         */
> +        startaddr = qemu_get_be64(mis->file);
> +        mask = qemu_get_be32(mis->file);
> +
> +        len -= 12;
> +
> +        while (mask) {
> +            /* mask= .....?10...0 */
> +            /*             ^fs    */
> +            int firstset = ctz32(mask);
> +
> +            /* tmp32=.....?11...1 */
> +            /*             ^fs    */
> +            uint32_t tmp32 = mask | ((((uint32_t)1)<<firstset)-1);
> +
> +            /* mask= .?01..10...0 */
> +            /*         ^fz ^fs    */
> +            int firstzero = cto32(tmp32);
> +
> +            if ((startaddr == 0) && (firstset < first_bit_offset)) {
> +                error_report("CMD_POSTCOPY_RAM_DISCARD bad data; bit set"
> +                               " prior to block; block=%s offset=%d"
> +                               " firstset=%d\n", ramid, first_bit_offset,
> +                               firstzero);
> +                return -1;
> +            }
> +
> +            /*
> +             * we know there must be at least 1 bit set due to the loop entry
> +             * If there is no 0 firstzero will be 32
> +             */
> +            /* TODO - ram_discard_range gets added in a later patch
> +            int ret = ram_discard_range(mis, ramid,
> +                                startaddr + firstset - first_bit_offset,
> +                                startaddr + (firstzero - 1) - first_bit_offset);
> +            ret = -1;
> +            if (ret) {
> +                return ret;
> +            }
> +            */
> +
> +            /* mask= .?0000000000 */
> +            /*         ^fz ^fs    */
> +            if (firstzero != 32) {
> +                mask &= (((uint32_t)-1) << firstzero);
> +            } else {
> +                mask = 0;
> +            }
> +        }

Ugh.  Again I ask, are you really sure the semi-sparse wire
representation is worth all this hassle?

> +    }
> +    trace_loadvm_postcopy_ram_handle_discard_end();
> +
> +    return 0;
> +}
> +
> +/* After this message we must be able to immediately receive postcopy data */

This doesn't quite make sense to me.  AFAICT, this is executed on the
destination side, so this isn't so much a command to start listening
as an assertion that the source side is already listening and ready to
receive page requests on the return path.

> +static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> +{
> +    PostcopyState ps = postcopy_state_set(mis, POSTCOPY_INCOMING_LISTENING);
> +    trace_loadvm_postcopy_handle_listen();
> +    if (ps != POSTCOPY_INCOMING_ADVISE) {
> +        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
> +        return -1;
> +    }
> +
> +    /* TODO start up the postcopy listening thread */
> +    return 0;
> +}
> +
> +/* After all discards we can start running and asking for pages */
> +static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
> +{
> +    PostcopyState ps = postcopy_state_set(mis, POSTCOPY_INCOMING_RUNNING);
> +    trace_loadvm_postcopy_handle_run();
> +    if (ps != POSTCOPY_INCOMING_LISTENING) {
> +        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
> +        return -1;
> +    }
> +
> +    if (autostart) {
> +        /* Hold onto your hats, starting the CPU */
> +        vm_start();
> +    } else {
> +        /* leave it paused and let management decide when to start the CPU */
> +        runstate_set(RUN_STATE_PAUSED);
> +    }
> +
> +    return 0;
> +}
> +
> +/* The end - with a byte from the source which can tell us to fail. */
> +static int loadvm_postcopy_handle_end(MigrationIncomingState *mis)
> +{
> +    PostcopyState ps = postcopy_state_get(mis);
> +    trace_loadvm_postcopy_handle_end();
> +    if (ps == POSTCOPY_INCOMING_NONE) {
> +        error_report("CMD_POSTCOPY_END in wrong postcopy state (%d)", ps);
> +        return -1;
> +    }
> +    return -1; /* TODO - expecting 1 byte good/fail */
> +}
> +
>  static int loadvm_process_command_simple_lencheck(const char *name,
>                                                    unsigned int actual,
>                                                    unsigned int expected)
> @@ -986,6 +1277,7 @@ static int loadvm_process_command(QEMUFile *f)
>      uint16_t com;
>      uint16_t len;
>      uint32_t tmp32;
> +    uint64_t tmp64a, tmp64b;
>  
>      com = qemu_get_be16(f);
>      len = qemu_get_be16(f);
> @@ -1023,6 +1315,39 @@ static int loadvm_process_command(QEMUFile *f)
>          migrate_send_rp_pong(mis, tmp32);
>          break;
>  
> +    case MIG_CMD_POSTCOPY_ADVISE:
> +        if (loadvm_process_command_simple_lencheck("CMD_POSTCOPY_ADVISE",
> +                                                   len, 16)) {
> +            return -1;
> +        }
> +        tmp64a = qemu_get_be64(f); /* hps */
> +        tmp64b = qemu_get_be64(f); /* tps */
> +        return loadvm_postcopy_handle_advise(mis, tmp64a, tmp64b);
> +
> +    case MIG_CMD_POSTCOPY_LISTEN:
> +        if (loadvm_process_command_simple_lencheck("CMD_POSTCOPY_LISTEN",
> +                                                   len, 0)) {
> +            return -1;
> +        }
> +        return loadvm_postcopy_handle_listen(mis);
> +
> +    case MIG_CMD_POSTCOPY_RUN:
> +        if (loadvm_process_command_simple_lencheck("CMD_POSTCOPY_RUN",
> +                                                   len, 0)) {
> +            return -1;
> +        }
> +        return loadvm_postcopy_handle_run(mis);
> +
> +    case MIG_CMD_POSTCOPY_END:
> +        if (loadvm_process_command_simple_lencheck("CMD_POSTCOPY_END",
> +                                                   len, 1)) {
> +            return -1;
> +        }
> +        return loadvm_postcopy_handle_end(mis);
> +
> +    case MIG_CMD_POSTCOPY_RAM_DISCARD:
> +        return loadvm_postcopy_ram_handle_discard(mis, len);
> +
>      default:
>          error_report("VM_COMMAND 0x%x unknown (len 0x%x)", com, len);
>          return -1;
> diff --git a/trace-events b/trace-events
> index 4ff55fe..050f553 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1171,11 +1171,22 @@ qemu_loadvm_state_main(void) ""
>  qemu_loadvm_state_main_quit_parent(void) ""
>  qemu_loadvm_state_post_main(int ret) "%d"
>  qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
> +loadvm_postcopy_handle_advise(void) ""
> +loadvm_postcopy_handle_end(void) ""
> +loadvm_postcopy_handle_listen(void) ""
> +loadvm_postcopy_handle_run(void) ""
> +loadvm_postcopy_ram_handle_discard(void) ""
> +loadvm_postcopy_ram_handle_discard_end(void) ""
>  loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d"
>  loadvm_process_command_ping(uint32_t val) "%x"
> +qemu_savevm_send_postcopy_advise(void) ""
> +qemu_savevm_send_postcopy_ram_discard(void) ""
>  savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
>  savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
>  savevm_send_ping(uint32_t val) "%x"
> +savevm_send_postcopy_end(void) ""
> +savevm_send_postcopy_listen(void) ""
> +savevm_send_postcopy_run(void) ""
>  savevm_state_begin(void) ""
>  savevm_state_header(void) ""
>  savevm_state_iterate(void) ""
Dr. David Alan Gilbert March 26, 2015, 4:33 p.m. UTC | #2
(Only replying to some of the items in this mail - the others I'll get
to another time).

* David Gibson (david@gibson.dropbear.id.au) wrote:
> On Wed, Feb 25, 2015 at 04:51:40PM +0000, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > The state of the postcopy process is managed via a series of messages;
> >    * Add wrappers and handlers for sending/receiving these messages
> >    * Add state variable that track the current state of postcopy
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  include/migration/migration.h |  15 ++
> >  include/sysemu/sysemu.h       |  23 +++
> >  migration/migration.c         |  13 ++
> >  savevm.c                      | 325 ++++++++++++++++++++++++++++++++++++++++++
> >  trace-events                  |  11 ++
> >  5 files changed, 387 insertions(+)
> > 
> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > index f94af5b..81cd1f2 100644
> > --- a/include/migration/migration.h
> > +++ b/include/migration/migration.h
> > @@ -52,6 +52,14 @@ typedef struct MigrationState MigrationState;
> >  
> >  typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head;
> >  
> > +typedef enum {
> > +    POSTCOPY_INCOMING_NONE = 0,  /* Initial state - no postcopy */
> > +    POSTCOPY_INCOMING_ADVISE,
> > +    POSTCOPY_INCOMING_LISTENING,
> > +    POSTCOPY_INCOMING_RUNNING,
> > +    POSTCOPY_INCOMING_END
> > +} PostcopyState;
> > +
> >  /* State for the incoming migration */
> >  struct MigrationIncomingState {
> >      QEMUFile *file;
> > @@ -59,6 +67,8 @@ struct MigrationIncomingState {
> >      /* See savevm.c */
> >      LoadStateEntry_Head loadvm_handlers;
> >  
> > +    PostcopyState postcopy_state;
> > +
> >      QEMUFile *return_path;
> >      QemuMutex      rp_mutex;    /* We send replies from multiple threads */
> >  };
> > @@ -219,4 +229,9 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
> >                               ram_addr_t offset, size_t size,
> >                               int *bytes_sent);
> >  
> > +PostcopyState postcopy_state_get(MigrationIncomingState *mis);
> > +
> > +/* Set the state and return the old state */
> > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> > +                                 PostcopyState new_state);
> >  #endif
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > index 8da879f..d6a6d51 100644
> > --- a/include/sysemu/sysemu.h
> > +++ b/include/sysemu/sysemu.h
> > @@ -87,6 +87,18 @@ enum qemu_vm_cmd {
> >      MIG_CMD_INVALID = 0,       /* Must be 0 */
> >      MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
> >      MIG_CMD_PING,              /* Request a PONG on the RP */
> > +
> > +    MIG_CMD_POSTCOPY_ADVISE = 20,  /* Prior to any page transfers, just
> > +                                      warn we might want to do PC */
> > +    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
> > +                                      pages as it's running. */
> > +    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
> > +    MIG_CMD_POSTCOPY_END,          /* Postcopy is finished. */
> > +
> > +    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
> > +                                      were previously sent during
> > +                                      precopy but are dirty. */
> > +
> >  };
> >  
> >  bool qemu_savevm_state_blocked(Error **errp);
> > @@ -101,6 +113,17 @@ void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
> >                                uint16_t len, uint8_t *data);
> >  void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
> >  void qemu_savevm_send_open_return_path(QEMUFile *f);
> > +void qemu_savevm_send_postcopy_advise(QEMUFile *f);
> > +void qemu_savevm_send_postcopy_listen(QEMUFile *f);
> > +void qemu_savevm_send_postcopy_run(QEMUFile *f);
> > +void qemu_savevm_send_postcopy_end(QEMUFile *f, uint8_t status);
> > +
> > +void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
> > +                                           uint16_t len, uint8_t offset,
> > +                                           uint64_t *addrlist,
> > +                                           uint32_t *masklist);
> > +
> > +
> >  int qemu_loadvm_state(QEMUFile *f);
> >  
> >  /* SLIRP */
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 434864a..957115a 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -971,3 +971,16 @@ void migrate_fd_connect(MigrationState *s)
> >      qemu_thread_create(&s->thread, "migration", migration_thread, s,
> >                         QEMU_THREAD_JOINABLE);
> >  }
> > +
> > +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
> > +{
> > +    return atomic_fetch_add(&mis->postcopy_state, 0);
> > +}
> > +
> > +/* Set the state and return the old state */
> > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> > +                                 PostcopyState new_state)
> > +{
> > +    return atomic_xchg(&mis->postcopy_state, new_state);
> 
> Is there anything explaining what the overall atomicity requirements
> are for this state variable?  It's a bit hard to tell if an atomic
> xchg is necessary or sufficient without a description of what the
> overall concurrency scheme is with regards to this variable.

Can you tell me how to define the requirements?
It's a state variable tested and changed by at least two threads and
it's got to go through a correct sequence of states.
So generally you're doing a 'I expect to be in .... now change to ....'
so the exchange works well for that.

> > + *  n x
> > + *      be64   Page addresses for start of an invalidation range
> > + *      be32   mask of 32 pages, '1' to discard'
> 
> Is the extra compactness from this semi-sparse bitmap encoding
> actually worth it?  A simple list of page addresses, or address ranges
> to discard would be substantially simpler to get one's head around,
> and also seems like it might be more robust against future
> implementation changes as a wire format.

As previously discussed I really think it is;  what I'm tending to
see when I've been looking at these in debug is something that's
sparse but tends to be blobby with sets of pages discarded near
by.  However you do this you're going to have to walk this
bitmap and format out some sort of set of messages.

> > + *  Hopefully this is pretty sparse so we don't get too many entries,
> > + *  and using the mask should deal with most pagesize differences
> > + *  just ending up as a single full mask
> > + *
> > + * The mask is always 32bits irrespective of the long size
> > + *
> > + *  name:  RAMBlock name that these entries are part of
> > + *  len: Number of page entries
> > + *  addrlist: 'len' addresses
> > + *  masklist: 'len' masks (corresponding to the addresses)
> > + */
> > +void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
> > +                                           uint16_t len, uint8_t offset,
> > +                                           uint64_t *addrlist,
> > +                                           uint32_t *masklist)
> > +{
> > +    uint8_t *buf;
> > +    uint16_t tmplen;
> > +    uint16_t t;
> > +
> > +    trace_qemu_savevm_send_postcopy_ram_discard();
> > +    buf = g_malloc0(len*12 + strlen(name) + 3);
> > +    buf[0] = 0; /* Version */
> > +    buf[1] = offset;
> > +    assert(strlen(name) < 256);
> > +    buf[2] = strlen(name);
> > +    memcpy(buf+3, name, strlen(name));
> > +    tmplen = 3+strlen(name);
> 
> Repeated calls to strlen() always seem icky to me, although I guess
> it's all gcc builtins here, so they are probably optimized out by
> CSE.

Yeh, those are now gone.  Thanks.

> > +    for (t = 0; t < len; t++) {
> > +        cpu_to_be64w((uint64_t *)(buf + tmplen), addrlist[t]);
> > +        tmplen += 8;
> > +        cpu_to_be32w((uint32_t *)(buf + tmplen), masklist[t]);
> > +        tmplen += 4;
> > +    }
> > +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
> > +    g_free(buf);
> > +}
> > +
> > +/* Get the destination into a state where it can receive postcopy data. */
> > +void qemu_savevm_send_postcopy_listen(QEMUFile *f)
> > +{
> > +    trace_savevm_send_postcopy_listen();
> > +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
> > +}
> > +
> > +/* Kick the destination into running */
> > +void qemu_savevm_send_postcopy_run(QEMUFile *f)
> > +{
> > +    trace_savevm_send_postcopy_run();
> > +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
> > +}
> 
> DISCARD will typically immediately precede LISTEN, won't it?  Is there
> a reason not to put the discard data into the LISTEN command?

Discard data can be quite large, so I potentially send multiple discard
commands.
(Also as you can tell generally I've got a preference for one message does one
thing, and thus I have tried to keep them separate).

> > +
> > +/* End of postcopy - with a status byte; 0 is good, anything else is a fail */
> > +void qemu_savevm_send_postcopy_end(QEMUFile *f, uint8_t status)
> > +{
> > +    trace_savevm_send_postcopy_end();
> > +    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_END, 1, &status);
> > +}
> 
> What's the distinction between the postcopy END command and the normal
> end of the migration stream?  Is there already a way to detect the end
> of stream normally?

OK, thanks I've killed that off.
The short answer is that the 'end of migration stream' is just a terminating
byte and I was hoping to put something better on there with a status
from the source side;  but that fix can be a general fix and doesn't
need to be postcopy.

> 
> >  bool qemu_savevm_state_blocked(Error **errp)
> >  {
> >      SaveStateEntry *se;
> > @@ -961,6 +1046,212 @@ enum LoadVMExitCodes {
> >  
> >  static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
> >  
> > +/* ------ incoming postcopy messages ------ */
> > +/* 'advise' arrives before any transfers just to tell us that a postcopy
> > + * *might* happen - it might be skipped if precopy transferred everything
> > + * quickly.
> > + */
> > +static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
> > +                                         uint64_t remote_hps,
> > +                                         uint64_t remote_tps)
> > +{
> > +    PostcopyState ps = postcopy_state_get(mis);
> > +    trace_loadvm_postcopy_handle_advise();
> > +    if (ps != POSTCOPY_INCOMING_NONE) {
> > +        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
> > +        return -1;
> > +    }
> > +
> > +    if (remote_hps != getpagesize())  {
> > +        /*
> > +         * Some combinations of mismatch are probably possible but it gets
> > +         * a bit more complicated.  In particular we need to place whole
> > +         * host pages on the dest at once, and we need to ensure that we
> > +         * handle dirtying to make sure we never end up sending part of
> > +         * a hostpage on it's own.
> > +         */
> > +        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
> > +                     (int)remote_hps, getpagesize());
> > +        return -1;
> > +    }
> > +
> > +    if (remote_tps != (1ul << qemu_target_page_bits())) {
> > +        /*
> > +         * Again, some differences could be dealt with, but for now keep it
> > +         * simple.
> > +         */
> > +        error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
> > +                     (int)remote_tps, 1 << qemu_target_page_bits());
> > +        return -1;
> > +    }
> > +
> > +    postcopy_state_set(mis, POSTCOPY_INCOMING_ADVISE);
> 
> Should you be checking the return value here to make sure it's still
> POSTCOPY_INCOMING_NONE?  Atomic xchgs seem overkill if you still have
> a race between the fetch at the top and the set here.
> 
> Or, in fact, should you be just doing an atomic exchange-then-check at
> the top, rather than checking at the top, then changing at the bottom.

There's no race at this point yet; going from None->advise we still only
have one thread.  The check at the top is a check against protocol
violations (e.g. getting two advise or something like that).

> > +    return 0;
> > +}
> > +
> > +/* After postcopy we will be told to throw some pages away since they're
> > + * dirty and will have to be demand fetched.  Must happen before CPU is
> > + * started.
> > + * There can be 0..many of these messages, each encoding multiple pages.
> > + * Bits set in the message represent a page in the source VMs bitmap, but
> > + * since the guest/target page sizes can be different on s/d then we have
> > + * to convert.
> 
> Uh.. I thought the checks in the ADVISE processing eliminated that possibility.

Old message; I was originally trying to keep it more general, but ripped
the conversions out when it became just too messy for now.
Gone.


Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
David Gibson March 27, 2015, 4:13 a.m. UTC | #3
On Thu, Mar 26, 2015 at 04:33:28PM +0000, Dr. David Alan Gilbert wrote:
> (Only replying to some of the items in this mail - the others I'll get
> to another time).
> 
> * David Gibson (david@gibson.dropbear.id.au) wrote:
> > On Wed, Feb 25, 2015 at 04:51:40PM +0000, Dr. David Alan Gilbert (git) wrote:
> > > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > > 
> > > The state of the postcopy process is managed via a series of messages;
> > >    * Add wrappers and handlers for sending/receiving these messages
> > >    * Add state variable that track the current state of postcopy
> > > 
> > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > ---
> > >  include/migration/migration.h |  15 ++
> > >  include/sysemu/sysemu.h       |  23 +++
> > >  migration/migration.c         |  13 ++
> > >  savevm.c                      | 325 ++++++++++++++++++++++++++++++++++++++++++
> > >  trace-events                  |  11 ++
> > >  5 files changed, 387 insertions(+)
> > > 
> > > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > > index f94af5b..81cd1f2 100644
> > > --- a/include/migration/migration.h
> > > +++ b/include/migration/migration.h
> > > @@ -52,6 +52,14 @@ typedef struct MigrationState MigrationState;
> > >  
> > >  typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head;
> > >  
> > > +typedef enum {
> > > +    POSTCOPY_INCOMING_NONE = 0,  /* Initial state - no postcopy */
> > > +    POSTCOPY_INCOMING_ADVISE,
> > > +    POSTCOPY_INCOMING_LISTENING,
> > > +    POSTCOPY_INCOMING_RUNNING,
> > > +    POSTCOPY_INCOMING_END
> > > +} PostcopyState;
> > > +
> > >  /* State for the incoming migration */
> > >  struct MigrationIncomingState {
> > >      QEMUFile *file;
> > > @@ -59,6 +67,8 @@ struct MigrationIncomingState {
> > >      /* See savevm.c */
> > >      LoadStateEntry_Head loadvm_handlers;
> > >  
> > > +    PostcopyState postcopy_state;
> > > +
> > >      QEMUFile *return_path;
> > >      QemuMutex      rp_mutex;    /* We send replies from multiple threads */
> > >  };
> > > @@ -219,4 +229,9 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
> > >                               ram_addr_t offset, size_t size,
> > >                               int *bytes_sent);
> > >  
> > > +PostcopyState postcopy_state_get(MigrationIncomingState *mis);
> > > +
> > > +/* Set the state and return the old state */
> > > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> > > +                                 PostcopyState new_state);
> > >  #endif
> > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > index 8da879f..d6a6d51 100644
> > > --- a/include/sysemu/sysemu.h
> > > +++ b/include/sysemu/sysemu.h
> > > @@ -87,6 +87,18 @@ enum qemu_vm_cmd {
> > >      MIG_CMD_INVALID = 0,       /* Must be 0 */
> > >      MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
> > >      MIG_CMD_PING,              /* Request a PONG on the RP */
> > > +
> > > +    MIG_CMD_POSTCOPY_ADVISE = 20,  /* Prior to any page transfers, just
> > > +                                      warn we might want to do PC */
> > > +    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
> > > +                                      pages as it's running. */
> > > +    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
> > > +    MIG_CMD_POSTCOPY_END,          /* Postcopy is finished. */
> > > +
> > > +    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
> > > +                                      were previously sent during
> > > +                                      precopy but are dirty. */
> > > +
> > >  };
> > >  
> > >  bool qemu_savevm_state_blocked(Error **errp);
> > > @@ -101,6 +113,17 @@ void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
> > >                                uint16_t len, uint8_t *data);
> > >  void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
> > >  void qemu_savevm_send_open_return_path(QEMUFile *f);
> > > +void qemu_savevm_send_postcopy_advise(QEMUFile *f);
> > > +void qemu_savevm_send_postcopy_listen(QEMUFile *f);
> > > +void qemu_savevm_send_postcopy_run(QEMUFile *f);
> > > +void qemu_savevm_send_postcopy_end(QEMUFile *f, uint8_t status);
> > > +
> > > +void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
> > > +                                           uint16_t len, uint8_t offset,
> > > +                                           uint64_t *addrlist,
> > > +                                           uint32_t *masklist);
> > > +
> > > +
> > >  int qemu_loadvm_state(QEMUFile *f);
> > >  
> > >  /* SLIRP */
> > > diff --git a/migration/migration.c b/migration/migration.c
> > > index 434864a..957115a 100644
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -971,3 +971,16 @@ void migrate_fd_connect(MigrationState *s)
> > >      qemu_thread_create(&s->thread, "migration", migration_thread, s,
> > >                         QEMU_THREAD_JOINABLE);
> > >  }
> > > +
> > > +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
> > > +{
> > > +    return atomic_fetch_add(&mis->postcopy_state, 0);
> > > +}
> > > +
> > > +/* Set the state and return the old state */
> > > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> > > +                                 PostcopyState new_state)
> > > +{
> > > +    return atomic_xchg(&mis->postcopy_state, new_state);
> > 
> > Is there anything explaining what the overall atomicity requirements
> > are for this state variable?  It's a bit hard to tell if an atomic
> > xchg is necessary or sufficient without a description of what the
> > overall concurrency scheme is with regards to this variable.
> 
> Can you tell me how to define the requirements?

Well, that is always the tricky question.

> It's a state variable tested and changed by at least two threads and
> it's got to go through a correct sequence of states.
> So generally you're doing a 'I expect to be in .... now change to ....'
> so the exchange works well for that.

So.. in this case.  It seems what might make sense as a basic
atomicity option here is a cmpxchg.  So your state_set function takes
old_state and new_state.  It will atomically update old_state to
new_state, returning an error if somebody else changed the state away
from old_state in the meantime.

Then you can have a blurb next to the state_set helper explaining why
you need this atomic op for updates to the state variable from
multiple threads.

> > > + *  n x
> > > + *      be64   Page addresses for start of an invalidation range
> > > + *      be32   mask of 32 pages, '1' to discard'
> > 
> > Is the extra compactness from this semi-sparse bitmap encoding
> > actually worth it?  A simple list of page addresses, or address ranges
> > to discard would be substantially simpler to get one's head around,
> > and also seems like it might be more robust against future
> > implementation changes as a wire format.
> 
> As previously discussed I really think it is;  what I'm tending to
> see when I've been looking at these in debug is something that's
> sparse but tends to be blobby with sets of pages discarded near
> by.  However you do this you're going to have to walk this
> bitmap and format out some sort of set of messages.

I'd really need to see some numbers to be convinced.  The semi-sparse
bitmap introduces a huge amount of code and complexity at both ends.

Remember, using ranges, you can still coalesce adjacent discarded
pages.  Even using some compat differential encodings of the range
endpoints might work out simpler than the bitmap fragments.

Plus, ranges handle the host versus target page size distinctions very
naturally.

[snip]
> > DISCARD will typically immediately precede LISTEN, won't it?  Is there
> > a reason not to put the discard data into the LISTEN command?
> 
> Discard data can be quite large, so I potentially send multiple discard
> commands.
> (Also as you can tell generally I've got a preference for one message does one
> thing, and thus I have tried to keep them separate).

So, I like the idea of one message per action in principle - but only
if that action really is well-defined without reference to what
operations come before and after it.  If there are hidden dependencies
about what actions have to come in what order, I'd rather bake that
into the command structure.

> > > +static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
> > > +                                         uint64_t remote_hps,
> > > +                                         uint64_t remote_tps)
> > > +{
> > > +    PostcopyState ps = postcopy_state_get(mis);
> > > +    trace_loadvm_postcopy_handle_advise();
> > > +    if (ps != POSTCOPY_INCOMING_NONE) {
> > > +        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (remote_hps != getpagesize())  {
> > > +        /*
> > > +         * Some combinations of mismatch are probably possible but it gets
> > > +         * a bit more complicated.  In particular we need to place whole
> > > +         * host pages on the dest at once, and we need to ensure that we
> > > +         * handle dirtying to make sure we never end up sending part of
> > > +         * a hostpage on it's own.
> > > +         */
> > > +        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
> > > +                     (int)remote_hps, getpagesize());
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (remote_tps != (1ul << qemu_target_page_bits())) {
> > > +        /*
> > > +         * Again, some differences could be dealt with, but for now keep it
> > > +         * simple.
> > > +         */
> > > +        error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
> > > +                     (int)remote_tps, 1 << qemu_target_page_bits());
> > > +        return -1;
> > > +    }
> > > +
> > > +    postcopy_state_set(mis, POSTCOPY_INCOMING_ADVISE);
> > 
> > Should you be checking the return value here to make sure it's still
> > POSTCOPY_INCOMING_NONE?  Atomic xchgs seem overkill if you still have
> > a race between the fetch at the top and the set here.
> > 
> > Or, in fact, should you be just doing an atomic exchange-then-check at
> > the top, rather than checking at the top, then changing at the bottom.
> 
> There's no race at this point yet; going from None->advise we still only
> have one thread.  The check at the top is a check against protocol
> violations (e.g. getting two advise or something like that).

Yeah.. and having to have intimate knowledge of the thread structure
at each point in order to analyze the atomic operation correctness is
exactly what bothers me about it.
Dr. David Alan Gilbert March 27, 2015, 10:48 a.m. UTC | #4
* David Gibson (david@gibson.dropbear.id.au) wrote:
> On Thu, Mar 26, 2015 at 04:33:28PM +0000, Dr. David Alan Gilbert wrote:
> > (Only replying to some of the items in this mail - the others I'll get
> > to another time).
> > 
> > * David Gibson (david@gibson.dropbear.id.au) wrote:
> > > On Wed, Feb 25, 2015 at 04:51:40PM +0000, Dr. David Alan Gilbert (git) wrote:
> > > > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > > > 
> > > > The state of the postcopy process is managed via a series of messages;
> > > >    * Add wrappers and handlers for sending/receiving these messages
> > > >    * Add state variable that track the current state of postcopy
> > > > 
> > > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > > > ---
> > > >  include/migration/migration.h |  15 ++
> > > >  include/sysemu/sysemu.h       |  23 +++
> > > >  migration/migration.c         |  13 ++
> > > >  savevm.c                      | 325 ++++++++++++++++++++++++++++++++++++++++++
> > > >  trace-events                  |  11 ++
> > > >  5 files changed, 387 insertions(+)
> > > > 
> > > > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > > > index f94af5b..81cd1f2 100644
> > > > --- a/include/migration/migration.h
> > > > +++ b/include/migration/migration.h
> > > > @@ -52,6 +52,14 @@ typedef struct MigrationState MigrationState;
> > > >  
> > > >  typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head;
> > > >  
> > > > +typedef enum {
> > > > +    POSTCOPY_INCOMING_NONE = 0,  /* Initial state - no postcopy */
> > > > +    POSTCOPY_INCOMING_ADVISE,
> > > > +    POSTCOPY_INCOMING_LISTENING,
> > > > +    POSTCOPY_INCOMING_RUNNING,
> > > > +    POSTCOPY_INCOMING_END
> > > > +} PostcopyState;
> > > > +
> > > >  /* State for the incoming migration */
> > > >  struct MigrationIncomingState {
> > > >      QEMUFile *file;
> > > > @@ -59,6 +67,8 @@ struct MigrationIncomingState {
> > > >      /* See savevm.c */
> > > >      LoadStateEntry_Head loadvm_handlers;
> > > >  
> > > > +    PostcopyState postcopy_state;
> > > > +
> > > >      QEMUFile *return_path;
> > > >      QemuMutex      rp_mutex;    /* We send replies from multiple threads */
> > > >  };
> > > > @@ -219,4 +229,9 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
> > > >                               ram_addr_t offset, size_t size,
> > > >                               int *bytes_sent);
> > > >  
> > > > +PostcopyState postcopy_state_get(MigrationIncomingState *mis);
> > > > +
> > > > +/* Set the state and return the old state */
> > > > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> > > > +                                 PostcopyState new_state);
> > > >  #endif
> > > > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> > > > index 8da879f..d6a6d51 100644
> > > > --- a/include/sysemu/sysemu.h
> > > > +++ b/include/sysemu/sysemu.h
> > > > @@ -87,6 +87,18 @@ enum qemu_vm_cmd {
> > > >      MIG_CMD_INVALID = 0,       /* Must be 0 */
> > > >      MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
> > > >      MIG_CMD_PING,              /* Request a PONG on the RP */
> > > > +
> > > > +    MIG_CMD_POSTCOPY_ADVISE = 20,  /* Prior to any page transfers, just
> > > > +                                      warn we might want to do PC */
> > > > +    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
> > > > +                                      pages as it's running. */
> > > > +    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
> > > > +    MIG_CMD_POSTCOPY_END,          /* Postcopy is finished. */
> > > > +
> > > > +    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
> > > > +                                      were previously sent during
> > > > +                                      precopy but are dirty. */
> > > > +
> > > >  };
> > > >  
> > > >  bool qemu_savevm_state_blocked(Error **errp);
> > > > @@ -101,6 +113,17 @@ void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
> > > >                                uint16_t len, uint8_t *data);
> > > >  void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
> > > >  void qemu_savevm_send_open_return_path(QEMUFile *f);
> > > > +void qemu_savevm_send_postcopy_advise(QEMUFile *f);
> > > > +void qemu_savevm_send_postcopy_listen(QEMUFile *f);
> > > > +void qemu_savevm_send_postcopy_run(QEMUFile *f);
> > > > +void qemu_savevm_send_postcopy_end(QEMUFile *f, uint8_t status);
> > > > +
> > > > +void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
> > > > +                                           uint16_t len, uint8_t offset,
> > > > +                                           uint64_t *addrlist,
> > > > +                                           uint32_t *masklist);
> > > > +
> > > > +
> > > >  int qemu_loadvm_state(QEMUFile *f);
> > > >  
> > > >  /* SLIRP */
> > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > index 434864a..957115a 100644
> > > > --- a/migration/migration.c
> > > > +++ b/migration/migration.c
> > > > @@ -971,3 +971,16 @@ void migrate_fd_connect(MigrationState *s)
> > > >      qemu_thread_create(&s->thread, "migration", migration_thread, s,
> > > >                         QEMU_THREAD_JOINABLE);
> > > >  }
> > > > +
> > > > +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
> > > > +{
> > > > +    return atomic_fetch_add(&mis->postcopy_state, 0);
> > > > +}
> > > > +
> > > > +/* Set the state and return the old state */
> > > > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> > > > +                                 PostcopyState new_state)
> > > > +{
> > > > +    return atomic_xchg(&mis->postcopy_state, new_state);
> > > 
> > > Is there anything explaining what the overall atomicity requirements
> > > are for this state variable?  It's a bit hard to tell if an atomic
> > > xchg is necessary or sufficient without a description of what the
> > > overall concurrency scheme is with regards to this variable.
> > 
> > Can you tell me how to define the requirements?
> 
> Well, that is always the tricky question.
> 
> > It's a state variable tested and changed by at least two threads and
> > it's got to go through a correct sequence of states.
> > So generally you're doing a 'I expect to be in .... now change to ....'
> > so the exchange works well for that.
> 
> So.. in this case.  It seems what might make sense as a basic
> atomicity option here is a cmpxchg.  So your state_set function takes
> old_state and new_state.  It will atomically update old_state to
> new_state, returning an error if somebody else changed the state away
> from old_state in the meantime.
> 
> Then you can have a blurb next to the state_set helper explaining why
> you need this atomic op for updates to the state variable from
> multiple threads.

I can document that next to state_set, and I'll look at whether it's 
possible to move the error check down to there.

> > > > + *  n x
> > > > + *      be64   Page addresses for start of an invalidation range
> > > > + *      be32   mask of 32 pages, '1' to discard'
> > > 
> > > Is the extra compactness from this semi-sparse bitmap encoding
> > > actually worth it?  A simple list of page addresses, or address ranges
> > > to discard would be substantially simpler to get one's head around,
> > > and also seems like it might be more robust against future
> > > implementation changes as a wire format.
> > 
> > As previously discussed I really think it is;  what I'm tending to
> > see when I've been looking at these in debug is something that's
> > sparse but tends to be blobby with sets of pages discarded near
> > by.  However you do this you're going to have to walk this
> > bitmap and format out some sort of set of messages.
> 
> I'd really need to see some numbers to be convinced.  The semi-sparse
> bitmap introduces a huge amount of code and complexity at both ends.
> 
> Remember, using ranges, you can still coalesce adjacent discarded
> pages.  Even using some compat differential encodings of the range
> endpoints might work out simpler than the bitmap fragments.
> 
> Plus, ranges handle the host versus target page size distinctions very
> naturally.

I guess I'm going to have to implement this and compare then; what format
do you want?

> [snip]
> > > DISCARD will typically immediately precede LISTEN, won't it?  Is there
> > > a reason not to put the discard data into the LISTEN command?
> > 
> > Discard data can be quite large, so I potentially send multiple discard
> > commands.
> > (Also as you can tell generally I've got a preference for one message does one
> > thing, and thus I have tried to keep them separate).
> 
> So, I like the idea of one message per action in principle - but only
> if that action really is well-defined without reference to what
> operations come before and after it.  If there are hidden dependencies
> about what actions have to come in what order, I'd rather bake that
> into the command structure.

In no way is it hidden; the commands match the state transitions.

> > > > +static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
> > > > +                                         uint64_t remote_hps,
> > > > +                                         uint64_t remote_tps)
> > > > +{
> > > > +    PostcopyState ps = postcopy_state_get(mis);
> > > > +    trace_loadvm_postcopy_handle_advise();
> > > > +    if (ps != POSTCOPY_INCOMING_NONE) {
> > > > +        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (remote_hps != getpagesize())  {
> > > > +        /*
> > > > +         * Some combinations of mismatch are probably possible but it gets
> > > > +         * a bit more complicated.  In particular we need to place whole
> > > > +         * host pages on the dest at once, and we need to ensure that we
> > > > +         * handle dirtying to make sure we never end up sending part of
> > > > +         * a hostpage on it's own.
> > > > +         */
> > > > +        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
> > > > +                     (int)remote_hps, getpagesize());
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (remote_tps != (1ul << qemu_target_page_bits())) {
> > > > +        /*
> > > > +         * Again, some differences could be dealt with, but for now keep it
> > > > +         * simple.
> > > > +         */
> > > > +        error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
> > > > +                     (int)remote_tps, 1 << qemu_target_page_bits());
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    postcopy_state_set(mis, POSTCOPY_INCOMING_ADVISE);
> > > 
> > > Should you be checking the return value here to make sure it's still
> > > POSTCOPY_INCOMING_NONE?  Atomic xchgs seem overkill if you still have
> > > a race between the fetch at the top and the set here.
> > > 
> > > Or, in fact, should you be just doing an atomic exchange-then-check at
> > > the top, rather than checking at the top, then changing at the bottom.
> > 
> > There's no race at this point yet; going from None->advise we still only
> > have one thread.  The check at the top is a check against protocol
> > violations (e.g. getting two advise or something like that).
> 
> Yeah.. and having to have intimate knowledge of the thread structure
> at each point in order to analyze the atomic operation correctness is
> exactly what bothers me about it.

No, you don't; by always using postcopy_state_set you don't need
to worry about the thread structure or the protocol to understand the atomic
operation's correctness.  The only reason you got into that comparison is that
you worried it might be overkill in this case; by keeping it consistent,
as it already is, you don't need to understand the thread structure.

Dave

> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson


--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini March 28, 2015, 3:43 p.m. UTC | #5
On 25/02/2015 17:51, Dr. David Alan Gilbert (git) wrote:
> +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
> +{
> +    return atomic_fetch_add(&mis->postcopy_state, 0);
> +}
> +
> +/* Set the state and return the old state */
> +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> +                                 PostcopyState new_state)
> +{
> +    return atomic_xchg(&mis->postcopy_state, new_state);
> +}

Which are the (multiple) threads are calling these functions?

Paolo
Paolo Bonzini March 28, 2015, 3:58 p.m. UTC | #6
On 27/03/2015 05:13, David Gibson wrote:
>> It's a state variable tested and changed by at least two threads
>> and it's got to go through a correct sequence of states. So
>> generally you're doing a 'I expect to be in .... now change to
>> ....' so the exchange works well for that.
> 
> So.. in this case.  It seems what might make sense as a basic 
> atomicity option here is a cmpxchg.  So your state_set function
> takes old_state and new_state.  It will atomically update old_state
> to new_state, returning an error if somebody else changed the state
> away from old_state in the meantime.
> 
> Then you can have a blurb next to the state_set helper explaining
> why you need this atomic op for updates to the state variable from 
> multiple threads.

I agree.

My humble suggestion is:

- document the transitions, see this existing example in thread-pool.c:

    /* Moving state out of THREAD_QUEUED is protected by lock.  After
     * that, only the worker thread can write to it.  Reads and writes
     * of state and ret are ordered with memory barriers.
     */
    enum ThreadState state;

The outgoing migration state fails to document this.  My fault.

- perhaps you can use a pattern similar to what is used for the
outgoing migration state.  There one thread does the work, and the
second (could be "the others", e.g. all VCPU threads) affect the first
just by triggering changes in the migration state.  Transitions in the
first thread use cmpxchg without a loop---if it fails it means the
second thread intervened, and the thread moves on to do whatever the
second thread requested.  Transitions in the second thread use cmpxchg
within a loop, in case there was a concurrent change.

Paolo

>>>> + *  n x + *      be64   Page addresses for start of an
>>>> invalidation range + *      be32   mask of 32 pages, '1' to
>>>> discard'
>>> 
>>> Is the extra compactness from this semi-sparse bitmap encoding 
>>> actually worth it?  A simple list of page addresses, or address
>>> ranges to discard would be substantially simpler to get one's
>>> head around, and also seems like it might be more robust
>>> against future implementation changes as a wire format.
>> 
>> As previously discussed I really think it is;  what I'm tending
>> to see when I've been looking at these in debug is something
>> that's sparse but tends to be blobby with sets of pages discarded
>> near by.  However you do this you're going to have to walk this 
>> bitmap and format out some sort of set of messages.
> 
> I'd really need to see some numbers to be convinced.  The
> semi-sparse bitmap introduces a huge amount of code and complexity
> at both ends.
> 
> Remember, using ranges, you can still coalesce adjacent discarded 
> pages.  Even using some compat differential encodings of the range 
> endpoints might work out simpler than the bitmap fragments.
> 
> Plus, ranges handle the host versus target page size distinctions
> very naturally.
> 
> [snip]
>>> DISCARD will typically immediately precede LISTEN, won't it?
>>> Is there a reason not to put the discard data into the LISTEN
>>> command?
>> 
>> Discard data can be quite large, so I potentially send multiple
>> discard commands. (Also as you can tell generally I've got a
>> preference for one message does one thing, and thus I have tried
>> to keep them separate).
> 
> So, I like the idea of one message per action in principle - but
> only if that action really is well-defined without reference to
> what operations come before and after it.  If there are hidden
> dependencies about what actions have to come in what order, I'd
> rather bake that into the command structure.
> 
>>>> +static int
>>>> loadvm_postcopy_handle_advise(MigrationIncomingState *mis, +
>>>> uint64_t remote_hps, +
>>>> uint64_t remote_tps) +{ +    PostcopyState ps =
>>>> postcopy_state_get(mis); +
>>>> trace_loadvm_postcopy_handle_advise(); +    if (ps !=
>>>> POSTCOPY_INCOMING_NONE) { +
>>>> error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state
>>>> (%d)", ps); +        return -1; +    } + +    if (remote_hps
>>>> != getpagesize())  { +        /* +         * Some
>>>> combinations of mismatch are probably possible but it gets +
>>>> * a bit more complicated.  In particular we need to place
>>>> whole +         * host pages on the dest at once, and we need
>>>> to ensure that we +         * handle dirtying to make sure we
>>>> never end up sending part of +         * a hostpage on it's
>>>> own. +         */ +        error_report("Postcopy needs
>>>> matching host page sizes (s=%d d=%d)", +
>>>> (int)remote_hps, getpagesize()); +        return -1; +    } 
>>>> + +    if (remote_tps != (1ul << qemu_target_page_bits())) { 
>>>> +        /* +         * Again, some differences could be
>>>> dealt with, but for now keep it +         * simple. +
>>>> */ +        error_report("Postcopy needs matching target page
>>>> sizes (s=%d d=%d)", +                     (int)remote_tps, 1
>>>> << qemu_target_page_bits()); +        return -1; +    } + +
>>>> postcopy_state_set(mis, POSTCOPY_INCOMING_ADVISE);
>>> 
>>> Should you be checking the return value here to make sure it's
>>> still POSTCOPY_INCOMING_NONE?  Atomic xchgs seem overkill if
>>> you still have a race between the fetch at the top and the set
>>> here.
>>> 
>>> Or, in fact, should you be just doing an atomic
>>> exchange-then-check at the top, rather than checking at the
>>> top, then changing at the bottom.
>> 
>> There's no race at this point yet; going from None->advise we
>> still only have one thread.  The check at the top is a check
>> against protocol violations (e.g. getting two advise or something
>> like that).
> 
> Yeah.. and having to have intimate knowledge of the thread
> structure at each point in order to analyze the atomic operation
> correctness is exactly what bothers me about it.
>
Paolo Bonzini March 28, 2015, 4 p.m. UTC | #7
On 27/03/2015 11:48, Dr. David Alan Gilbert wrote:
> > So, I like the idea of one message per action in principle - but only
> > if that action really is well-defined without reference to what
> > operations come before and after it.  If there are hidden dependencies
> > about what actions have to come in what order, I'd rather bake that
> > into the command structure.
> 
> In no way is it hidden; the commands match the state transitions.

So is there only one writer thread?  In which states can the other
threads read the postcopy state, and why is sequential consistency
important? That is, in threads other than the writer, how does reading
the state interact with other reads and writes?

Paolo
David Gibson March 30, 2015, 4:03 a.m. UTC | #8
On Fri, Mar 27, 2015 at 10:48:52AM +0000, Dr. David Alan Gilbert wrote:
> * David Gibson (david@gibson.dropbear.id.au) wrote:
> > On Thu, Mar 26, 2015 at 04:33:28PM +0000, Dr. David Alan Gilbert wrote:
> > > (Only replying to some of the items in this mail - the others I'll get
> > > to another time).
[snip]
> > > > DISCARD will typically immediately precede LISTEN, won't it?  Is there
> > > > a reason not to put the discard data into the LISTEN command?
> > > 
> > > Discard data can be quite large, so I potentially send multiple discard
> > > commands.
> > > (Also as you can tell generally I've got a preference for one message does one
> > > thing, and thus I have tried to keep them separate).
> > 
> > So, I like the idea of one message per action in principle - but only
> > if that action really is well-defined without reference to what
> > operations come before and after it.  If there are hidden dependencies
> > about what actions have to come in what order, I'd rather bake that
> > into the command structure.
> 
> In no way is it hidden; the commands match the state transitions.

Not all of them.  In particular DISCARD's relation to state
transitions is not at all obvious.  Likewise I don't think the
connection of LISTEN to the internal state machines at each end is
terribly clear.

> > > > > +static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
> > > > > +                                         uint64_t remote_hps,
> > > > > +                                         uint64_t remote_tps)
> > > > > +{
> > > > > +    PostcopyState ps = postcopy_state_get(mis);
> > > > > +    trace_loadvm_postcopy_handle_advise();
> > > > > +    if (ps != POSTCOPY_INCOMING_NONE) {
> > > > > +        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (remote_hps != getpagesize())  {
> > > > > +        /*
> > > > > +         * Some combinations of mismatch are probably possible but it gets
> > > > > +         * a bit more complicated.  In particular we need to place whole
> > > > > +         * host pages on the dest at once, and we need to ensure that we
> > > > > +         * handle dirtying to make sure we never end up sending part of
> > > > > +         * a hostpage on it's own.
> > > > > +         */
> > > > > +        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
> > > > > +                     (int)remote_hps, getpagesize());
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (remote_tps != (1ul << qemu_target_page_bits())) {
> > > > > +        /*
> > > > > +         * Again, some differences could be dealt with, but for now keep it
> > > > > +         * simple.
> > > > > +         */
> > > > > +        error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
> > > > > +                     (int)remote_tps, 1 << qemu_target_page_bits());
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    postcopy_state_set(mis, POSTCOPY_INCOMING_ADVISE);
> > > > 
> > > > Should you be checking the return value here to make sure it's still
> > > > POSTCOPY_INCOMING_NONE?  Atomic xchgs seem overkill if you still have
> > > > a race between the fetch at the top and the set here.
> > > > 
> > > > Or, in fact, should you be just doing an atomic exchange-then-check at
> > > > the top, rather than checking at the top, then changing at the bottom.
> > > 
> > > There's no race at this point yet; going from None->advise we still only
> > > have one thread.  The check at the top is a check against protocol
> > > violations (e.g. getting two advise or something like that).
> > 
> > Yeah.. and having to have intimate knowledge of the thread structure
> > at each point in order to analyze the atomic operation correctness is
> > exactly what bothers me about it.
> 
> No, you don't; By always using the postcopy_state_set you don't need
> to worry about the thread structure or protocol to understand the atomic operation
> correctness.  The only reason you got into that comparison is because
> you worried that it might be overkill in this case; by keeping it consistent
> like it already is then you don't need to understand the thread structure.

You really do, though.  You may be using the same function in the
original, but not the same idiom: in this case you don't check the
return value and deal with it accordingly.

Because of that, this looks pretty much exactly like the classic
"didn't understand what was atomic in the atomic" race condition, and
you need to know that there's only one thread at this point to realize
it's correct after all.
Dr. David Alan Gilbert March 30, 2015, 5:46 p.m. UTC | #9
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 25/02/2015 17:51, Dr. David Alan Gilbert (git) wrote:
> > +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
> > +{
> > +    return atomic_fetch_add(&mis->postcopy_state, 0);
> > +}
> > +
> > +/* Set the state and return the old state */
> > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> > +                                 PostcopyState new_state)
> > +{
> > +    return atomic_xchg(&mis->postcopy_state, new_state);
> > +}
> 
> Which are the (multiple) threads are calling these functions?

The main thread receiving the migration and the postcopy ram_listen_thread
receiving the RAM pages.
It's not actually racy between multiple threads updating it,
it's sequenced so that the main thread initialises it and then
hands over to the listen thread that takes it over from that point.

Dave

> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini March 30, 2015, 7:23 p.m. UTC | #10
On 30/03/2015 19:46, Dr. David Alan Gilbert wrote:
>>> > > +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
>>> > > +{
>>> > > +    return atomic_fetch_add(&mis->postcopy_state, 0);
>>> > > +}
>>> > > +
>>> > > +/* Set the state and return the old state */
>>> > > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
>>> > > +                                 PostcopyState new_state)
>>> > > +{
>>> > > +    return atomic_xchg(&mis->postcopy_state, new_state);
>>> > > +}
>> > 
>> > Which are the (multiple) threads are calling these functions?
> The main thread receiving the migration and the postcopy ram_listen_thread
> receiving the RAM pages.
> It's not actually racy between multiple threads updating it,
> it's sequenced so that the main thread initialises it and then
> hands over to the listen thread that takes it over from that point.

I would use atomic_mb_read/atomic_mb_set here.

Paolo
Dr. David Alan Gilbert March 31, 2015, 11:05 a.m. UTC | #11
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 30/03/2015 19:46, Dr. David Alan Gilbert wrote:
> >>> > > +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
> >>> > > +{
> >>> > > +    return atomic_fetch_add(&mis->postcopy_state, 0);
> >>> > > +}
> >>> > > +
> >>> > > +/* Set the state and return the old state */
> >>> > > +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
> >>> > > +                                 PostcopyState new_state)
> >>> > > +{
> >>> > > +    return atomic_xchg(&mis->postcopy_state, new_state);
> >>> > > +}
> >> > 
> >> > Which are the (multiple) threads are calling these functions?
> > The main thread receiving the migration and the postcopy ram_listen_thread
> > receiving the RAM pages.
> > It's not actually racy between multiple threads updating it,
> > it's sequenced so that the main thread initialises it and then
> > hands over to the listen thread that takes it over from that point.
> 
> I would use atomic_mb_read/atomic_mb_set here.

The benefit of the atomic_xchg is that I get the old value back so
that I can sanity check I was in the state I thought I should have
been.

Dave

> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini March 31, 2015, 11:10 a.m. UTC | #12
On 31/03/2015 13:05, Dr. David Alan Gilbert wrote:
> * Paolo Bonzini (pbonzini@redhat.com) wrote:
>>
>>
>> On 30/03/2015 19:46, Dr. David Alan Gilbert wrote:
>>>>>>> +PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
>>>>>>> +{
>>>>>>> +    return atomic_fetch_add(&mis->postcopy_state, 0);
>>>>>>> +}
>>>>>>> +
>>>>>>> +/* Set the state and return the old state */
>>>>>>> +PostcopyState postcopy_state_set(MigrationIncomingState *mis,
>>>>>>> +                                 PostcopyState new_state)
>>>>>>> +{
>>>>>>> +    return atomic_xchg(&mis->postcopy_state, new_state);
>>>>>>> +}
>>>>>
>>>>> Which are the (multiple) threads are calling these functions?
>>> The main thread receiving the migration and the postcopy ram_listen_thread
>>> receiving the RAM pages.
>>> It's not actually racy between multiple threads updating it,
>>> it's sequenced so that the main thread initialises it and then
>>> hands over to the listen thread that takes it over from that point.
>>
>> I would use atomic_mb_read/atomic_mb_set here.
> 
> The benefit of the atomic_xchg is that I get the old value back so
> that I can sanity check I was in the state I thought I should have
> been.

Ok, you can still pair atomic_xchg with atomic_mb_read, it's a bit cheaper.

Paolo
diff mbox

Patch

diff --git a/include/migration/migration.h b/include/migration/migration.h
index f94af5b..81cd1f2 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -52,6 +52,14 @@  typedef struct MigrationState MigrationState;
 
 typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head;
 
+typedef enum {
+    POSTCOPY_INCOMING_NONE = 0,  /* Initial state - no postcopy */
+    POSTCOPY_INCOMING_ADVISE,
+    POSTCOPY_INCOMING_LISTENING,
+    POSTCOPY_INCOMING_RUNNING,
+    POSTCOPY_INCOMING_END
+} PostcopyState;
+
 /* State for the incoming migration */
 struct MigrationIncomingState {
     QEMUFile *file;
@@ -59,6 +67,8 @@  struct MigrationIncomingState {
     /* See savevm.c */
     LoadStateEntry_Head loadvm_handlers;
 
+    PostcopyState postcopy_state;
+
     QEMUFile *return_path;
     QemuMutex      rp_mutex;    /* We send replies from multiple threads */
 };
@@ -219,4 +229,9 @@  size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
                              ram_addr_t offset, size_t size,
                              int *bytes_sent);
 
+PostcopyState postcopy_state_get(MigrationIncomingState *mis);
+
+/* Set the state and return the old state */
+PostcopyState postcopy_state_set(MigrationIncomingState *mis,
+                                 PostcopyState new_state);
 #endif
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 8da879f..d6a6d51 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -87,6 +87,18 @@  enum qemu_vm_cmd {
     MIG_CMD_INVALID = 0,       /* Must be 0 */
     MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
     MIG_CMD_PING,              /* Request a PONG on the RP */
+
+    MIG_CMD_POSTCOPY_ADVISE = 20,  /* Prior to any page transfers, just
+                                      warn we might want to do PC */
+    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
+                                      pages as it's running. */
+    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
+    MIG_CMD_POSTCOPY_END,          /* Postcopy is finished. */
+
+    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
+                                      were previously sent during
+                                      precopy but are dirty. */
+
 };
 
 bool qemu_savevm_state_blocked(Error **errp);
@@ -101,6 +113,17 @@  void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
                               uint16_t len, uint8_t *data);
 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
 void qemu_savevm_send_open_return_path(QEMUFile *f);
+void qemu_savevm_send_postcopy_advise(QEMUFile *f);
+void qemu_savevm_send_postcopy_listen(QEMUFile *f);
+void qemu_savevm_send_postcopy_run(QEMUFile *f);
+void qemu_savevm_send_postcopy_end(QEMUFile *f, uint8_t status);
+
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+                                           uint16_t len, uint8_t offset,
+                                           uint64_t *addrlist,
+                                           uint32_t *masklist);
+
+
 int qemu_loadvm_state(QEMUFile *f);
 
 /* SLIRP */
diff --git a/migration/migration.c b/migration/migration.c
index 434864a..957115a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -971,3 +971,16 @@  void migrate_fd_connect(MigrationState *s)
     qemu_thread_create(&s->thread, "migration", migration_thread, s,
                        QEMU_THREAD_JOINABLE);
 }
+
+PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
+{
+    return atomic_fetch_add(&mis->postcopy_state, 0);
+}
+
+/* Set the state and return the old state */
+PostcopyState postcopy_state_set(MigrationIncomingState *mis,
+                                 PostcopyState new_state)
+{
+    return atomic_xchg(&mis->postcopy_state, new_state);
+}
+
diff --git a/savevm.c b/savevm.c
index 4b619da..e31ccb0 100644
--- a/savevm.c
+++ b/savevm.c
@@ -39,6 +39,7 @@ 
 #include "exec/memory.h"
 #include "qmp-commands.h"
 #include "trace.h"
+#include "qemu/bitops.h"
 #include "qemu/iov.h"
 #include "block/snapshot.h"
 #include "block/qapi.h"
@@ -635,6 +636,90 @@  void qemu_savevm_send_open_return_path(QEMUFile *f)
     qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
 }
 
+/* Send prior to any postcopy transfer */
+void qemu_savevm_send_postcopy_advise(QEMUFile *f)
+{
+    uint64_t tmp[2];
+    tmp[0] = cpu_to_be64(getpagesize());
+    tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits());
+
+    trace_qemu_savevm_send_postcopy_advise();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp);
+}
+
+/* Prior to running, to cause pages that have been dirtied after precopy
+ * started to be discarded on the destination.
+ * CMD_POSTCOPY_RAM_DISCARD consist of:
+ *  3 byte header (filled in by qemu_savevm_send_postcopy_ram_discard)
+ *      byte   version (0)
+ *      byte   offset to be subtracted from each page address to deal with
+ *             RAMBlocks that don't start on a mask word boundary.
+ *      byte   Length of name field
+ *  n x byte   RAM block name (NOT 0 terminated)
+ *  n x
+ *      be64   Page addresses for start of an invalidation range
+ *      be32   mask of 32 pages, '1' to discard
+ *
+ *  Hopefully this is pretty sparse so we don't get too many entries,
+ *  and using the mask should deal with most pagesize differences
+ *  just ending up as a single full mask
+ *
+ * The mask is always 32bits irrespective of the long size
+ *
+ *  name:  RAMBlock name that these entries are part of
+ *  len: Number of page entries
+ *  addrlist: 'len' addresses
+ *  masklist: 'len' masks (corresponding to the addresses)
+ */
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+                                           uint16_t len, uint8_t offset,
+                                           uint64_t *addrlist,
+                                           uint32_t *masklist)
+{
+    uint8_t *buf;
+    uint16_t tmplen;
+    uint16_t t;
+
+    trace_qemu_savevm_send_postcopy_ram_discard();
+    buf = g_malloc0(len*12 + strlen(name) + 3);
+    buf[0] = 0; /* Version */
+    buf[1] = offset;
+    assert(strlen(name) < 256);
+    buf[2] = strlen(name);
+    memcpy(buf+3, name, strlen(name));
+    tmplen = 3+strlen(name);
+
+    for (t = 0; t < len; t++) {
+        cpu_to_be64w((uint64_t *)(buf + tmplen), addrlist[t]);
+        tmplen += 8;
+        cpu_to_be32w((uint32_t *)(buf + tmplen), masklist[t]);
+        tmplen += 4;
+    }
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
+    g_free(buf);
+}
+
+/* Get the destination into a state where it can receive postcopy data. */
+void qemu_savevm_send_postcopy_listen(QEMUFile *f)
+{
+    trace_savevm_send_postcopy_listen();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
+}
+
+/* Kick the destination into running */
+void qemu_savevm_send_postcopy_run(QEMUFile *f)
+{
+    trace_savevm_send_postcopy_run();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
+}
+
+/* End of postcopy - with a status byte; 0 is good, anything else is a fail */
+void qemu_savevm_send_postcopy_end(QEMUFile *f, uint8_t status)
+{
+    trace_savevm_send_postcopy_end();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_END, 1, &status);
+}
+
 bool qemu_savevm_state_blocked(Error **errp)
 {
     SaveStateEntry *se;
@@ -961,6 +1046,212 @@  enum LoadVMExitCodes {
 
 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 
+/* ------ incoming postcopy messages ------ */
+/* 'advise' arrives before any transfers just to tell us that a postcopy
+ * *might* happen - it might be skipped if precopy transferred everything
+ * quickly.
+ */
+static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
+                                         uint64_t remote_hps,
+                                         uint64_t remote_tps)
+{
+    PostcopyState ps = postcopy_state_get(mis);
+    trace_loadvm_postcopy_handle_advise();
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    if (remote_hps != getpagesize())  {
+        /*
+         * Some combinations of mismatch are probably possible but it gets
+         * a bit more complicated.  In particular we need to place whole
+         * host pages on the dest at once, and we need to ensure that we
+         * handle dirtying to make sure we never end up sending part of
+         * a hostpage on its own.
+         */
+        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
+                     (int)remote_hps, getpagesize());
+        return -1;
+    }
+
+    if (remote_tps != (1ul << qemu_target_page_bits())) {
+        /*
+         * Again, some differences could be dealt with, but for now keep it
+         * simple.
+         */
+        error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
+                     (int)remote_tps, 1 << qemu_target_page_bits());
+        return -1;
+    }
+
+    postcopy_state_set(mis, POSTCOPY_INCOMING_ADVISE);
+
+    return 0;
+}
+
+/* After postcopy we will be told to throw some pages away since they're
+ * dirty and will have to be demand fetched.  Must happen before CPU is
+ * started.
+ * There can be 0..many of these messages, each encoding multiple pages.
+ * Bits set in the message represent a page in the source VM's bitmap, but
+ * since the guest/target page sizes can be different on s/d then we have
+ * to convert.
+ */
+static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
+                                              uint16_t len)
+{
+    int tmp;
+    unsigned int first_bit_offset;
+    char ramid[256];
+    PostcopyState ps = postcopy_state_get(mis);
+
+    trace_loadvm_postcopy_ram_handle_discard();
+
+    if (ps != POSTCOPY_INCOMING_ADVISE) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
+                     ps);
+        return -1;
+    }
+    /* We're expecting a
+     *    3 byte header,
+     *    a RAM ID string
+     *    then at least one 12-byte chunk
+    */
+    if (len < 16) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+
+    tmp = qemu_get_byte(mis->file);
+    if (tmp != 0) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
+        return -1;
+    }
+    first_bit_offset = qemu_get_byte(mis->file);
+
+    if (qemu_get_counted_string(mis->file, ramid)) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
+        return -1;
+    }
+
+    len -= 3+strlen(ramid);
+    if (len % 12) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+    while (len) {
+        uint64_t startaddr;
+        uint32_t mask;
+        /*
+         * We now have pairs of address, mask
+         *   Each word of mask is 32 bits, where each bit corresponds to one
+         *   target page.
+         *   RAMBlocks don't necessarily start on word boundaries,
+         *   and the offset in the header indicates the offset into the 1st
+         *   mask word that corresponds to the 1st page of the RAMBlock.
+         */
+        startaddr = qemu_get_be64(mis->file);
+        mask = qemu_get_be32(mis->file);
+
+        len -= 12;
+
+        while (mask) {
+            /* mask= .....?10...0 */
+            /*             ^fs    */
+            int firstset = ctz32(mask);
+
+            /* tmp32=.....?11...1 */
+            /*             ^fs    */
+            uint32_t tmp32 = mask | ((((uint32_t)1)<<firstset)-1);
+
+            /* mask= .?01..10...0 */
+            /*         ^fz ^fs    */
+            int firstzero = cto32(tmp32);
+
+            if ((startaddr == 0) && (firstset < first_bit_offset)) {
+                error_report("CMD_POSTCOPY_RAM_DISCARD bad data; bit set"
+                               " prior to block; block=%s offset=%d"
+                               " firstset=%d\n", ramid, first_bit_offset,
+                               firstzero);
+                return -1;
+            }
+
+            /*
+             * we know there must be at least 1 bit set due to the loop entry
+             * If there is no 0 firstzero will be 32
+             */
+            /* TODO - ram_discard_range gets added in a later patch
+            int ret = ram_discard_range(mis, ramid,
+                                startaddr + firstset - first_bit_offset,
+                                startaddr + (firstzero - 1) - first_bit_offset);
+            ret = -1;
+            if (ret) {
+                return ret;
+            }
+            */
+
+            /* mask= .?0000000000 */
+            /*         ^fz ^fs    */
+            if (firstzero != 32) {
+                mask &= (((uint32_t)-1) << firstzero);
+            } else {
+                mask = 0;
+            }
+        }
+    }
+    trace_loadvm_postcopy_ram_handle_discard_end();
+
+    return 0;
+}
+
+/* After this message we must be able to immediately receive postcopy data */
+static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(mis, POSTCOPY_INCOMING_LISTENING);
+    trace_loadvm_postcopy_handle_listen();
+    if (ps != POSTCOPY_INCOMING_ADVISE) {
+        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    /* TODO start up the postcopy listening thread */
+    return 0;
+}
+
+/* After all discards we can start running and asking for pages */
+static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(mis, POSTCOPY_INCOMING_RUNNING);
+    trace_loadvm_postcopy_handle_run();
+    if (ps != POSTCOPY_INCOMING_LISTENING) {
+        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    if (autostart) {
+        /* Hold onto your hats, starting the CPU */
+        vm_start();
+    } else {
+        /* leave it paused and let management decide when to start the CPU */
+        runstate_set(RUN_STATE_PAUSED);
+    }
+
+    return 0;
+}
+
+/* The end - with a byte from the source which can tell us to fail. */
+static int loadvm_postcopy_handle_end(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_get(mis);
+    trace_loadvm_postcopy_handle_end();
+    if (ps == POSTCOPY_INCOMING_NONE) {
+        error_report("CMD_POSTCOPY_END in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+    return -1; /* TODO - expecting 1 byte good/fail */
+}
+
 static int loadvm_process_command_simple_lencheck(const char *name,
                                                   unsigned int actual,
                                                   unsigned int expected)
@@ -986,6 +1277,7 @@  static int loadvm_process_command(QEMUFile *f)
     uint16_t com;
     uint16_t len;
     uint32_t tmp32;
+    uint64_t tmp64a, tmp64b;
 
     com = qemu_get_be16(f);
     len = qemu_get_be16(f);
@@ -1023,6 +1315,39 @@  static int loadvm_process_command(QEMUFile *f)
         migrate_send_rp_pong(mis, tmp32);
         break;
 
+    case MIG_CMD_POSTCOPY_ADVISE:
+        if (loadvm_process_command_simple_lencheck("CMD_POSTCOPY_ADVISE",
+                                                   len, 16)) {
+            return -1;
+        }
+        tmp64a = qemu_get_be64(f); /* hps */
+        tmp64b = qemu_get_be64(f); /* tps */
+        return loadvm_postcopy_handle_advise(mis, tmp64a, tmp64b);
+
+    case MIG_CMD_POSTCOPY_LISTEN:
+        if (loadvm_process_command_simple_lencheck("CMD_POSTCOPY_LISTEN",
+                                                   len, 0)) {
+            return -1;
+        }
+        return loadvm_postcopy_handle_listen(mis);
+
+    case MIG_CMD_POSTCOPY_RUN:
+        if (loadvm_process_command_simple_lencheck("CMD_POSTCOPY_RUN",
+                                                   len, 0)) {
+            return -1;
+        }
+        return loadvm_postcopy_handle_run(mis);
+
+    case MIG_CMD_POSTCOPY_END:
+        if (loadvm_process_command_simple_lencheck("CMD_POSTCOPY_END",
+                                                   len, 1)) {
+            return -1;
+        }
+        return loadvm_postcopy_handle_end(mis);
+
+    case MIG_CMD_POSTCOPY_RAM_DISCARD:
+        return loadvm_postcopy_ram_handle_discard(mis, len);
+
     default:
         error_report("VM_COMMAND 0x%x unknown (len 0x%x)", com, len);
         return -1;
diff --git a/trace-events b/trace-events
index 4ff55fe..050f553 100644
--- a/trace-events
+++ b/trace-events
@@ -1171,11 +1171,22 @@  qemu_loadvm_state_main(void) ""
 qemu_loadvm_state_main_quit_parent(void) ""
 qemu_loadvm_state_post_main(int ret) "%d"
 qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
+loadvm_postcopy_handle_advise(void) ""
+loadvm_postcopy_handle_end(void) ""
+loadvm_postcopy_handle_listen(void) ""
+loadvm_postcopy_handle_run(void) ""
+loadvm_postcopy_ram_handle_discard(void) ""
+loadvm_postcopy_ram_handle_discard_end(void) ""
 loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d"
 loadvm_process_command_ping(uint32_t val) "%x"
+qemu_savevm_send_postcopy_advise(void) ""
+qemu_savevm_send_postcopy_ram_discard(void) ""
 savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
 savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
 savevm_send_ping(uint32_t val) "%x"
+savevm_send_postcopy_end(void) ""
+savevm_send_postcopy_listen(void) ""
+savevm_send_postcopy_run(void) ""
 savevm_state_begin(void) ""
 savevm_state_header(void) ""
 savevm_state_iterate(void) ""