[4/6] migration: calculate downtime on dst side

Message ID 1492175840-5021-5-git-send-email-a.perevalov@samsung.com
State New

Commit Message

Alexey Perevalov April 14, 2017, 1:17 p.m. UTC
This patch provides downtime calculation per vCPU,
as a summary and as an overlapped value for all vCPUs.

This approach keeps a tree with the page fault address as a key,
and the t1-t2 interval between the page fault time and the page copy
time, together with the affected vCPU bit mask.
For more implementation details please see the comment to the
get_postcopy_total_downtime function.

Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>
---
 include/migration/migration.h |  14 +++
 migration/migration.c         | 280 +++++++++++++++++++++++++++++++++++++++++-
 migration/postcopy-ram.c      |  24 +++-
 migration/qemu-file.c         |   1 -
 migration/trace-events        |   9 +-
 5 files changed, 323 insertions(+), 5 deletions(-)

Comments

Dr. David Alan Gilbert April 21, 2017, noon UTC | #1
* Alexey Perevalov (a.perevalov@samsung.com) wrote:
> This patch provides downtime calculation per vCPU,
> as a summary and as an overlapped value for all vCPUs.
> 
> This approach keeps a tree with the page fault address as a key,
> and the t1-t2 interval between the page fault time and the page copy
> time, together with the affected vCPU bit mask.
> For more implementation details please see the comment to the
> get_postcopy_total_downtime function.
> 
> Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>
> ---
>  include/migration/migration.h |  14 +++
>  migration/migration.c         | 280 +++++++++++++++++++++++++++++++++++++++++-
>  migration/postcopy-ram.c      |  24 +++-
>  migration/qemu-file.c         |   1 -
>  migration/trace-events        |   9 +-
>  5 files changed, 323 insertions(+), 5 deletions(-)
> 
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index 5720c88..5d2c628 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -123,10 +123,24 @@ struct MigrationIncomingState {
>  
>      /* See savevm.c */
>      LoadStateEntry_Head loadvm_handlers;
> +
> +    /*
> +     *  Tree for keeping postcopy downtime,
> +     *  necessary to calculate the correct downtime during multiple
> +     *  vm suspends; it keeps the host page address as a key and
> +     *  DowntimeDuration as data.
> +     *  NULL means the kernel couldn't provide the process thread id,
> +     *  and QEMU couldn't identify which vCPU raised the page fault
> +     */
> +    GTree *postcopy_downtime;
>  };
>  
>  MigrationIncomingState *migration_incoming_get_current(void);
>  void migration_incoming_state_destroy(void);
> +void mark_postcopy_downtime_begin(uint64_t addr, int cpu);
> +void mark_postcopy_downtime_end(uint64_t addr);
> +uint64_t get_postcopy_total_downtime(void);
> +void destroy_downtime_duration(gpointer data);
>  
>  /*
>   * An outstanding page request, on the source, having been received
> diff --git a/migration/migration.c b/migration/migration.c
> index 79f6425..5bac434 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -38,6 +38,8 @@
>  #include "io/channel-tls.h"
>  #include "migration/colo.h"
>  
> +#define DEBUG_VCPU_DOWNTIME 1
> +
>  #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
>  
>  /* Amount of time to allocate to each "chunk" of bandwidth-throttled
> @@ -77,6 +79,19 @@ static NotifierList migration_state_notifiers =
>  
>  static bool deferred_incoming;
>  
> +typedef struct {
> +    int64_t begin;
> +    int64_t end;
> +    uint64_t *cpus; /* cpus bit mask array; QEMU bit functions support
> +     bit operations on memory regions, but don't check out of range */
> +} DowntimeDuration;
> +
> +typedef struct {
> +    int64_t tp; /* point in time */
> +    bool is_end;
> +    uint64_t *cpus;
> +} OverlapDowntime;
> +
>  /*
>   * Current state of incoming postcopy; note this is not part of
>   * MigrationIncomingState since it's state is used during cleanup
> @@ -117,6 +132,13 @@ MigrationState *migrate_get_current(void)
>      return &current_migration;
>  }
>  
> +void destroy_downtime_duration(gpointer data)
> +{
> +    DowntimeDuration *dd = (DowntimeDuration *)data;
> +    g_free(dd->cpus);
> +    g_free(data);
> +}
> +
>  MigrationIncomingState *migration_incoming_get_current(void)
>  {
>      static bool once;
> @@ -138,10 +160,13 @@ void migration_incoming_state_destroy(void)
>      struct MigrationIncomingState *mis = migration_incoming_get_current();
>  
>      qemu_event_destroy(&mis->main_thread_load_event);
> +    if (mis->postcopy_downtime) {
> +        g_tree_destroy(mis->postcopy_downtime);
> +        mis->postcopy_downtime = NULL;
> +    }
>      loadvm_free_handlers(mis);
>  }
>  
> -
>  typedef struct {
>      bool optional;
>      uint32_t size;
> @@ -1754,7 +1779,6 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
>       */
>      ms->postcopy_after_devices = true;
>      notifier_list_notify(&migration_state_notifiers, ms);
> -

Stray deletion

>      ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
>  
>      qemu_mutex_unlock_iothread();
> @@ -2117,3 +2141,255 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
>      return atomic_xchg(&incoming_postcopy_state, new_state);
>  }
>  
> +#define SIZE_TO_KEEP_CPUBITS (1 + smp_cpus/sizeof(guint64))

Split out your cpu-sets so that you have an 'alloc_cpu_set',
a 'set bit', a 'set all bits', dup, etc.
(I see Linux has cpumask.h that has a 'cpu_set' that's
basically the same thing, but we need something portablish.)
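
A minimal sketch of what that could look like, built on QEMU's
"qemu/bitmap.h" API (the alloc_cpu_set/cpu_set_* names here are
hypothetical, not an existing QEMU interface):

    #include "qemu/bitmap.h"

    /* Hypothetical portable cpu-set helpers, sized by smp_cpus. */
    static unsigned long *alloc_cpu_set(void)
    {
        return bitmap_new(smp_cpus);   /* zero-filled */
    }

    static void cpu_set_bit_on(unsigned long *set, int cpu)
    {
        set_bit(cpu, set);
    }

    static void cpu_set_all(unsigned long *set)
    {
        bitmap_fill(set, smp_cpus);
    }

    static unsigned long *cpu_set_dup(const unsigned long *set)
    {
        unsigned long *copy = bitmap_new(smp_cpus);

        bitmap_copy(copy, set, smp_cpus);
        return copy;
    }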

> +void mark_postcopy_downtime_begin(uint64_t addr, int cpu)
> +{
> +    MigrationIncomingState *mis = migration_incoming_get_current();
> +    DowntimeDuration *dd;
> +    if (!mis->postcopy_downtime) {
> +        return;
> +    }
> +
> +    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr); /* !!! cast */
> +    if (!dd) {
> +        dd = (DowntimeDuration *)g_new0(DowntimeDuration, 1);
> +        dd->cpus = g_new0(guint64, SIZE_TO_KEEP_CPUBITS);
> +        g_tree_insert(mis->postcopy_downtime, (gpointer)addr, (gpointer)dd);
> +    }
> +
> +    if (cpu < 0) {
> +        /* assume in this situation all vCPUs are sleeping */
> +        int i;
> +        for (i = 0; i < SIZE_TO_KEEP_CPUBITS; i++) {
> +            dd->cpus[i] = ~(uint64_t)0u;
> +        }
> +    } else
> +        set_bit(cpu, dd->cpus);

Qemu coding style: Use {}'s even on one line blocks

> +
> +    /*
> +     *  overwrite previously set dd->begin, if that page already was
> +     *     faulted on another cpu
> +     */
> +    dd->begin = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

OK, so this is making a decision that needs to be documented;
that is that if one CPU was already paused at time (a), then a second
CPU we see is paused at time (b), then the time  we record only starts
at (b) and ignores the time from a..b  - is that the way you want to do it?
As I say, it should be documented somewhere; it's probably worth
adding something to docs/migration.txt about how this measurement works.
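
(With concrete numbers: if CPU0 faults on a page at t=100ms and CPU1
faults on the same page at t=130ms, dd->begin is overwritten to 130, so
the 30ms during which only CPU0 was stopped is not counted for that
page's interval.)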


> +    trace_mark_postcopy_downtime_begin(addr, dd, dd->begin, cpu);
> +}
> +
> +void mark_postcopy_downtime_end(uint64_t addr)
> +{
> +    MigrationIncomingState *mis = migration_incoming_get_current();
> +    DowntimeDuration *dd;
> +    if (!mis->postcopy_downtime) {
> +        return;
> +    }
> +
> +    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
> +    if (!dd) {
> +        /* error_report("Could not populate downtime duration completion time \n\
> +                        There is no downtime duration for 0x%"PRIx64, addr); */

Error or no error - decide!   Is this happening for pages that arrive before
they've been requested?
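
If the missing entries are expected for pages placed by the normal
background transfer (no fault was ever recorded for them), one way to
settle it, sketched here with a hypothetical trace event, would be:

    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
    if (!dd) {
        /* Pages placed by the background stream were never faulted on,
         * so a missing entry is expected; trace it instead. */
        trace_postcopy_downtime_end_no_fault(addr); /* hypothetical */
        return;
    }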

> +        return;
> +    }
> +
> +    dd->end = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +    trace_mark_postcopy_downtime_end(addr, dd, dd->end);
> +}
> +
> +struct downtime_overlay_cxt {
> +    GPtrArray *downtime_points;
> +    size_t number_of_points;
> +};

Why 'cxt' ? If you mean as an abbreviation to context, then we normally use ctxt.

> +/*
> + * This function splits each DowntimeDuration, which is represented as a
> + * start/end pair, into points, then fills the array with those points,
> + * to sort them later.
> + */
> +static gboolean split_duration_and_fill_points(gpointer key, gpointer value,
> +                                        gpointer data)
> +{
> +    struct downtime_overlay_cxt *ctx = (struct downtime_overlay_cxt *)data;
> +    DowntimeDuration *dd = (DowntimeDuration *)value;
> +    GPtrArray *interval = ctx->downtime_points;
> +    if (dd->begin) {
> +        OverlapDowntime *od_begin = g_new0(OverlapDowntime, 1);
> +        od_begin->cpus = g_memdup(dd->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> +        od_begin->tp = dd->begin;
> +        od_begin->is_end = false;
> +        g_ptr_array_add(interval, od_begin);
> +        ctx->number_of_points += 1;
> +    }
> +
> +    if (dd->end) {
> +        OverlapDowntime *od_end = g_new0(OverlapDowntime, 1);
> +        od_end->cpus = g_memdup(dd->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> +        od_end->tp = dd->end;
> +        od_end->is_end = true;
> +        g_ptr_array_add(interval, od_end);
> +        ctx->number_of_points += 1;
> +    }
> +
> +    if (dd->end && dd->begin)
> +        trace_split_duration_and_fill_points(dd->end - dd->begin, (uint64_t)key);

again, need {}'s

> +    return FALSE;
> +}
> +
> +#ifdef DEBUG_VCPU_DOWNTIME
> +static gboolean calculate_per_cpu(gpointer key, gpointer value,
> +                                  gpointer data)
> +{
> +    int *downtime_cpu = (int *)data;
> +    DowntimeDuration *dd = (DowntimeDuration *)value;
> +    int cpu_iter;
> +    for (cpu_iter = 0; cpu_iter < smp_cpus; cpu_iter++) {
> +        if (test_bit(cpu_iter, dd->cpus) && dd->end && dd->begin)
> +            downtime_cpu[cpu_iter] += dd->end - dd->begin;
> +    }
> +    return FALSE;
> +}
> +#endif /* DEBUG_VCPU_DOWNTIME */
> +
> +static gint compare_downtime(gconstpointer a, gconstpointer b)
> +{
> +    DowntimeDuration *dda = (DowntimeDuration *)a;
> +    DowntimeDuration *ddb = (DowntimeDuration *)b;
> +    return dda->begin - ddb->begin;
> +}
> +
> +static void destroy_overlap_downtime(gpointer data)
> +{
> +    OverlapDowntime *od = (OverlapDowntime *)data;
> +    g_free(od->cpus);
> +    g_free(data);
> +}
> +
> +static int check_overlap(uint64_t *b)
> +{
> +    unsigned long zero_bit = find_first_zero_bit(b, BITS_PER_LONG * SIZE_TO_KEEP_CPUBITS);

Line's too long.

> +    return zero_bit >= smp_cpus;

So this is really 'all cpus are blocked'?
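
If so, a sketch with a name that says that directly (reusing the patch's
definitions):

    /* true when every vCPU appears in the blocked-cpus mask */
    static bool all_vcpus_blocked(uint64_t *cpus)
    {
        unsigned long zero_bit =
            find_first_zero_bit((unsigned long *)cpus,
                                BITS_PER_LONG * SIZE_TO_KEEP_CPUBITS);

        return zero_bit >= smp_cpus;
    }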

> +}
> +
> +/*
> + * This function calculates downtime per cpu and traces it
> + *
> + *  Also it calculates total downtime as the intervals' overlap,
> + *  for many vCPUs.
> + *
> + *  The approach is following:
> + *  Initially intervals are represented in tree where key is
> + *  pagefault address, and values:
> + *   begin - page fault time
> + *   end   - page load time
> + *   cpus  - bit mask shows affected cpus
> + *
> + *  To calculate overlap on all cpus, intervals converted into
> + *  array of points in time (downtime_points), the size of
> + *  array is 2 * number of nodes in tree of intervals (2 array
> + *  elements per one in element of interval).
> + *  Each element is marked as end (E) or as start (S) of interval.
> + *  The overlap downtime will be calculated for SE, only in case
> + *  there is sequence S(0..N)E(M) for every vCPU.
> + *
> + * As an example we have 3 CPUs
> + *
> + *      S1        E1           S1               E1
> + * -----***********------------xxx***************------------------------> CPU1
> + *
> + *             S2                E2
> + * ------------****************xxx---------------------------------------> CPU2
> + *
> + *                         S3            E3
> + * ------------------------****xxx********-------------------------------> CPU3
> + *
> + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
> + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3
> + * S3,S1,E2 - sequenece includes all CPUs, in this case overlap will be S1,E2
                       ^ typo

> + * Legend of picture is following: * - means downtime per vCPU
> + *                                 x - means overlapped downtime
> + */
> +uint64_t get_postcopy_total_downtime(void)
> +{
> +    MigrationIncomingState *mis = migration_incoming_get_current();
> +    uint64_t total_downtime = 0; /* for total overlapped downtime */
> +    const int intervals = g_tree_nnodes(mis->postcopy_downtime);
> +    int point_iter, start_point_iter, i;
> +    struct downtime_overlay_cxt dp_ctx = { 0 };
> +    /*
> +     * array will contain 2 * interval points or less, if
> +     * it was not page fault finalization for page,
> +     * real count will be in ctx.number_of_points
> +     */
> +    dp_ctx.downtime_points = g_ptr_array_new_full(2 * intervals,
> +                                                     destroy_overlap_downtime);

Is the g_ptr_array giving you anything here over a plain-old C array of pointers?
You're not dynamically growing it.
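
A sketch of the plain-array variant, reusing the patch's types; note that
qsort(), like g_ptr_array_sort(), hands the comparator pointers *to* the
array elements, so it must dereference twice:

    static int compare_points(const void *a, const void *b)
    {
        const OverlapDowntime *pa = *(OverlapDowntime * const *)a;
        const OverlapDowntime *pb = *(OverlapDowntime * const *)b;

        return (pa->tp > pb->tp) - (pa->tp < pb->tp);
    }

    /* in get_postcopy_total_downtime(): */
    OverlapDowntime **points = g_new0(OverlapDowntime *, 2 * intervals);
    size_t npoints = 0;
    /* ... fill points[npoints++] while walking the tree ... */
    qsort(points, npoints, sizeof(points[0]), compare_points);
    /* ... scan the sorted points, then release them ... */
    for (i = 0; i < npoints; i++) {
        destroy_overlap_downtime(points[i]);
    }
    g_free(points);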

> +    if (!mis->postcopy_downtime) {
> +        goto out;
> +    }
> +
> +#ifdef DEBUG_VCPU_DOWNTIME
> +    {
> +        gint *downtime_cpu = g_new0(int, smp_cpus);
> +        g_tree_foreach(mis->postcopy_downtime, calculate_per_cpu, downtime_cpu);
> +        for (point_iter = 0; point_iter < smp_cpus; point_iter++)
> +        {
> +            trace_downtime_per_cpu(point_iter, downtime_cpu[point_iter]);
> +        }
> +        g_free(downtime_cpu);
> +    }
> +#endif /* DEBUG_VCPU_DOWNTIME */

You might want to make that:
  if (TRACE_DOWNTIME_PER_CPU_ENABLED) {
  }

and remove the ifdef.
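
i.e. keep the body but guard it with the macro tracetool generates for
the downtime_per_cpu event:

    if (TRACE_DOWNTIME_PER_CPU_ENABLED) {
        gint *downtime_cpu = g_new0(int, smp_cpus);

        g_tree_foreach(mis->postcopy_downtime, calculate_per_cpu,
                       downtime_cpu);
        for (point_iter = 0; point_iter < smp_cpus; point_iter++) {
            trace_downtime_per_cpu(point_iter, downtime_cpu[point_iter]);
        }
        g_free(downtime_cpu);
    }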

> +    /* make downtime points S/E from interval */
> +    g_tree_foreach(mis->postcopy_downtime, split_duration_and_fill_points,
> +                   &dp_ctx);
> +    g_ptr_array_sort(dp_ctx.downtime_points, compare_downtime);
> +
> +    for (point_iter = 1; point_iter < dp_ctx.number_of_points;
> +         point_iter++) {
> +        OverlapDowntime *od = g_ptr_array_index(dp_ctx.downtime_points,
> +                point_iter);
> +        uint64_t *cur_cpus;
> +        int smp_cpus_i = smp_cpus;
> +        OverlapDowntime *prev_od = g_ptr_array_index(dp_ctx.downtime_points,
> +                                                     point_iter - 1);
> +        if (!od || !prev_od)
> +            continue;

Why would that happen?

> +        /* we need sequence SE */
> +        if (!od->is_end || prev_od->is_end)
> +            continue;
> +
> +        cur_cpus = g_memdup(od->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> +        for (start_point_iter = point_iter - 1;
> +             start_point_iter >= 0 && smp_cpus_i;
> +             start_point_iter--, smp_cpus_i--) {

I think I see what you're doing in this loop, although it's a bit hairy;
I don't think I understand why we needed to get prev_od  if this loop is searching
backwards?

> +            OverlapDowntime *t_od = g_ptr_array_index(dp_ctx.downtime_points,
> +                                                      start_point_iter);
> +            if (!t_od)
> +                break;
> +            /* should be S */
> +            if (t_od->is_end)
> +                break;
> +
> +            /* points were sorted; it's possible that the end never
> +             * occurred, and such points were omitted
> +             * in split_duration_and_fill_points */
> +            if (od->tp <= prev_od->tp) {

Why is this checking od and prev_od in this loop - isn't this
loop mainly t_od ?

> +                break;
> +            }
> +
> +            for (i = 0; i < SIZE_TO_KEEP_CPUBITS; i++) {
> +                cur_cpus[i] |= t_od->cpus[i];
> +            }
> +
> +            /* check_overlap - just count number of bits in cur_cpus,
> +             * and compare it with smp_cpus */
> +            if (check_overlap(cur_cpus)) {
> +                total_downtime += od->tp - prev_od->tp;
> +                /* situation when one S point represents all vCPU is possible */
> +                break;
> +            }
> +        }
> +        g_free(cur_cpus);
> +    }
> +    trace_get_postcopy_total_downtime(g_tree_nnodes(mis->postcopy_downtime),
> +        total_downtime);
> +out:
> +    g_ptr_array_free(dp_ctx.downtime_points, TRUE);
> +    return total_downtime;
> +}
> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> index 70f0480..ea89f4e 100644
> --- a/migration/postcopy-ram.c
> +++ b/migration/postcopy-ram.c
> @@ -23,8 +23,10 @@
>  #include "migration/postcopy-ram.h"
>  #include "sysemu/sysemu.h"
>  #include "sysemu/balloon.h"
> +#include <sys/param.h>
>  #include "qemu/error-report.h"
>  #include "trace.h"
> +#include "glib/glib-helper.h"
>  
>  /* Arbitrary limit on size of each discard command,
>   * keeps them around ~200 bytes
> @@ -81,6 +83,11 @@ static bool ufd_version_check(int ufd, MigrationIncomingState *mis)
>          return false;
>      }
>  
> +    if (mis && UFFD_FEATURE_THREAD_ID & api_struct.features) {

That's a very weird way of writing that test!  Also, I think you need
to still make this user-selectable given the complexity/cost.
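
For the test itself, the conventional form would be:

    if (mis && (api_struct.features & UFFD_FEATURE_THREAD_ID)) {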

> +        mis->postcopy_downtime = g_tree_new_full(g_int_cmp64,
> +                                         NULL, NULL, destroy_downtime_duration);
> +    }
> +
>      if (getpagesize() != ram_pagesize_summary()) {
>          bool have_hp = false;
>          /* We've got a huge page */
> @@ -404,6 +411,18 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
>      return 0;
>  }
>  
> +static int get_mem_fault_cpu_index(uint32_t pid)
> +{
> +    CPUState *cpu_iter;
> +
> +    CPU_FOREACH(cpu_iter) {
> +        if (cpu_iter->thread_id == pid)
> +           return cpu_iter->cpu_index;
> +    }
> +    trace_get_mem_fault_cpu_index(pid);
> +    return -1;
> +}
> +
>  /*
>   * Handle faults detected by the USERFAULT markings
>   */
> @@ -481,8 +500,10 @@ static void *postcopy_ram_fault_thread(void *opaque)
>          rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
>          trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
>                                                  qemu_ram_get_idstr(rb),
> -                                                rb_offset);
> +                                                rb_offset, msg.arg.pagefault.feat.ptid);

Line length!

>  
> +        mark_postcopy_downtime_begin(msg.arg.pagefault.address,
> +                            get_mem_fault_cpu_index(msg.arg.pagefault.feat.ptid));
>          /*
>           * Send the request to the source - we want to request one
>           * of our host page sizes (which is >= TPS)
> @@ -577,6 +598,7 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
>  
>          return -e;
>      }
> +    mark_postcopy_downtime_end((uint64_t)host);
>  
>      trace_postcopy_place_page(host);
>      return 0;
> diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> index 195fa94..c9f3e47 100644
> --- a/migration/qemu-file.c
> +++ b/migration/qemu-file.c
> @@ -547,7 +547,6 @@ size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
>  int qemu_peek_byte(QEMUFile *f, int offset)
>  {
>      int index = f->buf_index + offset;
> -

Stray!

>      assert(!qemu_file_is_writable(f));
>      assert(offset < IO_BUF_SIZE);
>  
> diff --git a/migration/trace-events b/migration/trace-events
> index 7372ce2..ab2e1e4 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -110,6 +110,12 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
>  process_incoming_migration_co_postcopy_end_main(void) ""
>  migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
>  migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname)  "ioc=%p ioctype=%s hostname=%s"
> +mark_postcopy_downtime_begin(uint64_t addr, void *dd, int64_t time, int cpu) "addr 0x%" PRIx64 " dd %p time %" PRId64 " cpu %d"
> +mark_postcopy_downtime_end(uint64_t addr, void *dd, int64_t time) "addr 0x%" PRIx64 " dd %p time %" PRId64
> +get_postcopy_total_downtime(int num, uint64_t total) "faults %d, total downtime %" PRIu64
> +split_duration_and_fill_points(int64_t downtime, uint64_t addr) "downtime %" PRId64 " addr 0x%" PRIx64
> +downtime_per_cpu(int cpu_index, int downtime) "downtime cpu[%d]=%d"
> +source_return_path_thread_downtime(uint64_t downtime) "downtime %" PRIu64
>  
>  # migration/rdma.c
>  qemu_rdma_accept_incoming_migration(void) ""
> @@ -186,7 +192,7 @@ postcopy_ram_enable_notify(void) ""
>  postcopy_ram_fault_thread_entry(void) ""
>  postcopy_ram_fault_thread_exit(void) ""
>  postcopy_ram_fault_thread_quit(void) ""
> -postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
> +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, int pid) "Request for HVA=%" PRIx64 " rb=%s offset=%zx %d"
>  postcopy_ram_incoming_cleanup_closeuf(void) ""
>  postcopy_ram_incoming_cleanup_entry(void) ""
>  postcopy_ram_incoming_cleanup_exit(void) ""
> @@ -195,6 +201,7 @@ save_xbzrle_page_skipping(void) ""
>  save_xbzrle_page_overflow(void) ""
>  ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
>  ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
> +get_mem_fault_cpu_index(uint32_t pid) "pid %u is not vCPU"
>  
>  # migration/exec.c
>  migration_exec_outgoing(const char *cmd) "cmd=%s"
> -- 
> 1.8.3.1
> 

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Alexey Perevalov April 21, 2017, 6:47 p.m. UTC | #2
Hello, David!


I apologize, I forgot to check the patches with the checkpatch.pl script, but I have
now checked and fixed the code style issues in the patches. I also checked the
existing files: migration.c has code style errors, and glib-compat.h does too.
I could send patches for those to qemu-trivial, if you are not against it.


On Fri, Apr 21, 2017 at 01:00:32PM +0100, Dr. David Alan Gilbert wrote:
> * Alexey Perevalov (a.perevalov@samsung.com) wrote:
[...]
> > +
> > +    /*
> > +     *  overwrite previously set dd->begin, if that page already was
> > +     *     faulted on another cpu
> > +     */
> > +    dd->begin = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> 
> OK, so this is making a decision that needs to be documented;
> that is that if one CPU was already paused at time (a), then a second
> CPU we see is paused at time (b), then the time  we record only starts
> at (b) and ignores the time from a..b  - is that the way you want to do it?
Yes, the time interval when at least one vCPU is running isn't counted.

[...]
> > +static int check_overlap(uint64_t *b)
> > +{
> > +    unsigned long zero_bit = find_first_zero_bit(b, BITS_PER_LONG * SIZE_TO_KEEP_CPUBITS);
> 
> Line's too long.
> 
> > +    return zero_bit >= smp_cpus;
> 
> So this is really 'all cpus are blocked'?
Yes, that's the condition for it.
[...]
> > +    dp_ctx.downtime_points = g_ptr_array_new_full(2 * intervals,
> > +                                                     destroy_overlap_downtime);
> 
> Is the g_ptr_array giving you anything here over a plain-old C array of pointers?
> You're not dynamically growing it.
Yes, I know the upper bound of that array at that point, and GPtrArray is
maybe a little heavy here. OK, I'll use a plain array.

[...]
> > +    for (point_iter = 1; point_iter < dp_ctx.number_of_points;
> > +         point_iter++) {
> > +        OverlapDowntime *od = g_ptr_array_index(dp_ctx.downtime_points,
> > +                point_iter);
> > +        uint64_t *cur_cpus;
> > +        int smp_cpus_i = smp_cpus;
> > +        OverlapDowntime *prev_od = g_ptr_array_index(dp_ctx.downtime_points,
> > +                                                     point_iter - 1);
> > +        if (!od || !prev_od)
> > +            continue;
> 
> Why would that happen?
The loop now goes up to dp_ctx.number_of_points, so in this version it looks
impossible.
> 
> > +        /* we need sequence SE */
> > +        if (!od->is_end || prev_od->is_end)
> > +            continue;
> > +
> > +        cur_cpus = g_memdup(od->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> > +        for (start_point_iter = point_iter - 1;
> > +             start_point_iter >= 0 && smp_cpus_i;
> > +             start_point_iter--, smp_cpus_i--) {
> 
> I think I see what you're doing in this loop, although it's a bit hairy;
> I don't think I understand why we needed to get prev_od  if this loop is searching
> backwards?
Just for the condition,
if (!od->is_end || prev_od->is_end) {
    continue;
}
to skip any other sequences, like EE.
Do you think the following condition is more readable?
if (!(od->is_end && !prev_od->is_end)) {
    continue;
}

Also, prev_od is the nearest point to the end, so the time from that point
to the end is what is interesting. I depicted that above.
> 
> > +            OverlapDowntime *t_od = g_ptr_array_index(dp_ctx.downtime_points,
> > +                                                      start_point_iter);
> > +            if (!t_od)
> > +                break;
> > +            /* should be S */
> > +            if (t_od->is_end)
> > +                break;
> > +
> > +            /* points were sorted; it's possible that the end never
> > +             * occurred, and such points were omitted
> > +             * in split_duration_and_fill_points */
> > +            if (od->tp <= prev_od->tp) {
> 
> Why is this checking od and prev_od in this loop - isn't this
> loop mainly t_od ?
Right, that code shouldn't be here.
[...]
> > +    if (mis && UFFD_FEATURE_THREAD_ID & api_struct.features) {
> 
> That's a very weird way of writing that test!  Also, I think you need
> to still make this user-selectable given the complexity/cost.
>
Like this?

{"execute": "migrate-set-capabilities", "arguments":
    { "capabilities": [ { "capability": "calculate-postcopy-downtime",
                          "state": true } ] } }

I tried to put the heavy operations well after the hot path (page requesting
and copying). The algorithm complexity is NumberOfPages * NumberOfvCPUs; in
the case of hugepages there are not so many pages.
OK, if it's conditionally obtained from the kernel anyway, why not give the
user the ability to choose.
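
A sketch of the plumbing for that, following the migrate_<cap>() helper
pattern in migration.c; the capability constant is hypothetical, i.e.
what the qapi generator would produce for the proposed name:

    bool migrate_postcopy_downtime(void)
    {
        MigrationState *s = migrate_get_current();

        return s->enabled_capabilities[
            MIGRATION_CAPABILITY_CALCULATE_POSTCOPY_DOWNTIME];
    }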

[...]
Alexey Perevalov April 22, 2017, 9:49 a.m. UTC | #3
Hello David,
this mail is just for the CPUMASK discussion.

On Fri, Apr 21, 2017 at 01:00:32PM +0100, Dr. David Alan Gilbert wrote:
> * Alexey Perevalov (a.perevalov@samsung.com) wrote:
[...]
> > +#define SIZE_TO_KEEP_CPUBITS (1 + smp_cpus/sizeof(guint64))
> 
> Split out your cpu-sets so that you have an 'alloc_cpu_set',
> a 'set bit' a 'set all bits', dup etc
> (I see Linux has cpumask.h that has a 'cpu_set' that's
> basically the same thing, but we need something portablish.)
> 
Agreed, the way I'm working with the cpumask is a little naive.
Instead of setting the all-cpus mask precisely, as ((1 << smp_cpus) - 1),
in the case when all vCPUs are sleeping, I just set all bits to ~0, because
I didn't use functions like cpumask_and.
If you think this patch should use a cpumask, a cpumask patchset should be
introduced first in a separate thread, and then this patchset should be
rebased on top of it.


> > +void mark_postcopy_downtime_begin(uint64_t addr, int cpu)
> > +{
> > +    MigrationIncomingState *mis = migration_incoming_get_current();
> > +    DowntimeDuration *dd;
> > +    if (!mis->postcopy_downtime) {
> > +        return;
> > +    }
> > +
> > +    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr); /* !!! cast */
> > +    if (!dd) {
> > +        dd = (DowntimeDuration *)g_new0(DowntimeDuration, 1);
> > +        dd->cpus = g_new0(guint64, SIZE_TO_KEEP_CPUBITS);
> > +        g_tree_insert(mis->postcopy_downtime, (gpointer)addr, (gpointer)dd);
> > +    }
> > +
> > +    if (cpu < 0) {
> > +        /* assume in this situation all vCPUs are sleeping */
> > +        int i;
> > +        for (i = 0; i < SIZE_TO_KEEP_CPUBITS; i++) {
> > +            dd->cpus[i] = ~(uint64_t)0u;
> > +        }
> > +    } else
> > +        set_bit(cpu, dd->cpus);
> 
> Qemu coding style: Use {}'s even on one line blocks
> 
> > +
> > +    /*
> > +     *  overwrite previously set dd->begin, if that page already was
> > +     *     faulted on another cpu
> > +     */
> > +    dd->begin = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> 
> OK, so this is making a decision that needs to be documented;
> that is that if one CPU was already paused at time (a), then a second
> CPU we see is paused at time (b), then the time  we record only starts
> at (b) and ignores the time from a..b  - is that the way you want to do it?
> As I say, it should be documented somewhere; it's probably worth
> adding something to docs/migration.txt about how this measurement works.
> 
> 
> > +    trace_mark_postcopy_downtime_begin(addr, dd, dd->begin, cpu);
> > +}
> > +
> > +void mark_postcopy_downtime_end(uint64_t addr)
> > +{
> > +    MigrationIncomingState *mis = migration_incoming_get_current();
> > +    DowntimeDuration *dd;
> > +    if (!mis->postcopy_downtime) {
> > +        return;
> > +    }
> > +
> > +    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
> > +    if (!dd) {
> > +        /* error_report("Could not populate downtime duration completion time \n\
> > +                        There is no downtime duration for 0x%"PRIx64, addr); */
> 
> Error or no error - decide!   Is this happening for pages that arrive before
> they've been requested?
> 
> > +        return;
> > +    }
> > +
> > +    dd->end = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > +    trace_mark_postcopy_downtime_end(addr, dd, dd->end);
> > +}
> > +
> > +struct downtime_overlay_cxt {
> > +    GPtrArray *downtime_points;
> > +    size_t number_of_points;
> > +};
> 
> Why 'cxt' ? If you mean as an abbreviation to context, then we normally use ctxt.
> 
> > +/*
> > + * This function split each DowntimeDuration, which represents as start/end
> > + * pointand makes a points of it, then fill array with points,
> > + * to sort it in future.
> > + */
> > +static gboolean split_duration_and_fill_points(gpointer key, gpointer value,
> > +                                        gpointer data)
> > +{
> > +    struct downtime_overlay_cxt *ctx = (struct downtime_overlay_cxt *)data;
> > +    DowntimeDuration *dd = (DowntimeDuration *)value;
> > +    GPtrArray *interval = ctx->downtime_points;
> > +    if (dd->begin) {
> > +        OverlapDowntime *od_begin = g_new0(OverlapDowntime, 1);
> > +        od_begin->cpus = g_memdup(dd->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> > +        od_begin->tp = dd->begin;
> > +        od_begin->is_end = false;
> > +        g_ptr_array_add(interval, od_begin);
> > +        ctx->number_of_points += 1;
> > +    }
> > +
> > +    if (dd->end) {
> > +        OverlapDowntime *od_end = g_new0(OverlapDowntime, 1);
> > +        od_end->cpus = g_memdup(dd->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> > +        od_end->tp = dd->end;
> > +        od_end->is_end = true;
> > +        g_ptr_array_add(interval, od_end);
> > +        ctx->number_of_points += 1;
> > +    }
> > +
> > +    if (dd->end && dd->begin)
> > +        trace_split_duration_and_fill_points(dd->end - dd->begin, (uint64_t)key);
> 
> again, need {}'s
> 
> > +    return FALSE;
> > +}
> > +
> > +#ifdef DEBUG_VCPU_DOWNTIME
> > +static gboolean calculate_per_cpu(gpointer key, gpointer value,
> > +                                  gpointer data)
> > +{
> > +    int *downtime_cpu = (int *)data;
> > +    DowntimeDuration *dd = (DowntimeDuration *)value;
> > +    int cpu_iter;
> > +    for (cpu_iter = 0; cpu_iter < smp_cpus; cpu_iter++) {
> > +        if (test_bit(cpu_iter, dd->cpus) && dd->end && dd->begin)
> > +            downtime_cpu[cpu_iter] += dd->end - dd->begin;
> > +    }
> > +    return FALSE;
> > +}
> > +#endif /* DEBUG_VCPU_DOWNTIME */
> > +
> > +static gint compare_downtime(gconstpointer a, gconstpointer b)
> > +{
> > +    DowntimeDuration *dda = (DowntimeDuration *)a;
> > +    DowntimeDuration *ddb = (DowntimeDuration *)b;
> > +    return dda->begin - ddb->begin;
> > +}
> > +
> > +static void destroy_overlap_downtime(gpointer data)
> > +{
> > +    OverlapDowntime *od = (OverlapDowntime *)data;
> > +    g_free(od->cpus);
> > +    g_free(data);
> > +}
> > +
> > +static int check_overlap(uint64_t *b)
> > +{
> > +    unsigned long zero_bit = find_first_zero_bit(b, BITS_PER_LONG * SIZE_TO_KEEP_CPUBITS);
> 
> Line's too long.
> 
> > +    return zero_bit >= smp_cpus;
> 
> So this is really 'all cpus are blocked'?
> 
> > +}
> > +
> > +/*
> > + * This function calculates downtime per cpu and trace it
> > + *
> > + *  Also it calculates total downtime as an interval's overlap,
> > + *  for many vCPU.
> > + *
> > + *  The approach is following:
> > + *  Initially intervals are represented in tree where key is
> > + *  pagefault address, and values:
> > + *   begin - page fault time
> > + *   end   - page load time
> > + *   cpus  - bit mask shows affected cpus
> > + *
> > + *  To calculate overlap on all cpus, intervals converted into
> > + *  array of points in time (downtime_points), the size of
> > + *  array is 2 * number of nodes in tree of intervals (2 array
> > + *  elements per one in element of interval).
> > + *  Each element is marked as end (E) or as start (S) of interval.
> > + *  The overlap downtime will be calculated for SE, only in case
> > + *  there is sequence S(0..N)E(M) for every vCPU.
> > + *
> > + * As example we have 3 CPU
> > + *
> > + *      S1        E1           S1               E1
> > + * -----***********------------xxx***************------------------------> CPU1
> > + *
> > + *             S2                E2
> > + * ------------****************xxx---------------------------------------> CPU2
> > + *
> > + *                         S3            E3
> > + * ------------------------****xxx********-------------------------------> CPU3
> > + *
> > + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
> > + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3
> > + * S3,S1,E2 - sequenece includes all CPUs, in this case overlap will be S1,E2
>                        ^ typo
> 
> > + * Legend of picture is following: * - means downtime per vCPU
> > + *                                 x - means overlapped downtime
> > + */
> > +uint64_t get_postcopy_total_downtime(void)
> > +{
> > +    MigrationIncomingState *mis = migration_incoming_get_current();
> > +    uint64_t total_downtime = 0; /* for total overlapped downtime */
> > +    const int intervals = g_tree_nnodes(mis->postcopy_downtime);
> > +    int point_iter, start_point_iter, i;
> > +    struct downtime_overlay_cxt dp_ctx = { 0 };
> > +    /*
> > +     * array will contain 2 * interval points or less, if
> > +     * it was not page fault finalization for page,
> > +     * real count will be in ctx.number_of_points
> > +     */
> > +    dp_ctx.downtime_points = g_ptr_array_new_full(2 * intervals,
> > +                                                     destroy_overlap_downtime);
> 
> Is the g_ptr_array giving you anything here over a plain-old C array of pointers?
> You're not dynamically growing it.
> 
> > +    if (!mis->postcopy_downtime) {
> > +        goto out;
> > +    }
> > +
> > +#ifdef DEBUG_VCPU_DOWNTIME
> > +    {
> > +        gint *downtime_cpu = g_new0(int, smp_cpus);
> > +        g_tree_foreach(mis->postcopy_downtime, calculate_per_cpu, downtime_cpu);
> > +        for (point_iter = 0; point_iter < smp_cpus; point_iter++)
> > +        {
> > +            trace_downtime_per_cpu(point_iter, downtime_cpu[point_iter]);
> > +        }
> > +        g_free(downtime_cpu);
> > +    }
> > +#endif /* DEBUG_VCPU_DOWNTIME */
> 
> You might want to make that:
>   if (TRACE_DOWNTIME_PER_CPU_ENABLED) {
>   }
> 
> and remove the ifdef.
> 
> > +    /* make downtime points S/E from interval */
> > +    g_tree_foreach(mis->postcopy_downtime, split_duration_and_fill_points,
> > +                   &dp_ctx);
> > +    g_ptr_array_sort(dp_ctx.downtime_points, compare_downtime);
> > +
> > +    for (point_iter = 1; point_iter < dp_ctx.number_of_points;
> > +         point_iter++) {
> > +        OverlapDowntime *od = g_ptr_array_index(dp_ctx.downtime_points,
> > +                point_iter);
> > +        uint64_t *cur_cpus;
> > +        int smp_cpus_i = smp_cpus;
> > +        OverlapDowntime *prev_od = g_ptr_array_index(dp_ctx.downtime_points,
> > +                                                     point_iter - 1);
> > +        if (!od || !prev_od)
> > +            continue;
> 
> Why would that happen?
> 
> > +        /* we need sequence SE */
> > +        if (!od->is_end || prev_od->is_end)
> > +            continue;
> > +
> > +        cur_cpus = g_memdup(od->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> > +        for (start_point_iter = point_iter - 1;
> > +             start_point_iter >= 0 && smp_cpus_i;
> > +             start_point_iter--, smp_cpus_i--) {
> 
> I think I see what you're doing in this loop, although it's a bit hairy;
> I don't think I understand why we needed to get prev_od  if this loop is searching
> backwards?
> 
> > +            OverlapDowntime *t_od = g_ptr_array_index(dp_ctx.downtime_points,
> > +                                                      start_point_iter);
> > +            if (!t_od)
> > +                break;
> > +            /* should be S */
> > +            if (t_od->is_end)
> > +                break;
> > +
> > +            /* points were sorted, it's possible when
> > +             * the end has not occurred, but these points were omitted
> > +             * in split_duration_and_fill_points */
> > +            if (od->tp <= prev_od->tp) {
> 
> Why is this checking od and prev_od in this loop - isn't this
> loop mainly t_od ?
> 
> > +                break;
> > +            }
> > +
> > +            for (i = 0; i < SIZE_TO_KEEP_CPUBITS; i++) {
> > +                cur_cpus[i] |= t_od->cpus[i];
> > +            }
> > +
> > +            /* check_overlap - just count number of bits in cur_cpus,
> > +             * and compare it with smp_cpus */
> > +            if (check_overlap(cur_cpus)) {
> > +                total_downtime += od->tp - prev_od->tp;
> > +                /* situation when one S point represents all vCPU is possible */
> > +                break;
> > +            }
> > +        }
> > +        g_free(cur_cpus);
> > +    }
> > +    trace_get_postcopy_total_downtime(g_tree_nnodes(mis->postcopy_downtime),
> > +        total_downtime);
> > +out:
> > +    g_ptr_array_free(dp_ctx.downtime_points, TRUE);
> > +    return total_downtime;
> > +}
> > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> > index 70f0480..ea89f4e 100644
> > --- a/migration/postcopy-ram.c
> > +++ b/migration/postcopy-ram.c
> > @@ -23,8 +23,10 @@
> >  #include "migration/postcopy-ram.h"
> >  #include "sysemu/sysemu.h"
> >  #include "sysemu/balloon.h"
> > +#include <sys/param.h>
> >  #include "qemu/error-report.h"
> >  #include "trace.h"
> > +#include "glib/glib-helper.h"
> >  
> >  /* Arbitrary limit on size of each discard command,
> >   * keeps them around ~200 bytes
> > @@ -81,6 +83,11 @@ static bool ufd_version_check(int ufd, MigrationIncomingState *mis)
> >          return false;
> >      }
> >  
> > +    if (mis && UFFD_FEATURE_THREAD_ID & api_struct.features) {
> 
> That's a very weird way of writing that test!  Also, I think you need
> to still make this user-selectable given the complexity/cost.
> 
> > +        mis->postcopy_downtime = g_tree_new_full(g_int_cmp64,
> > +                                         NULL, NULL, destroy_downtime_duration);
> > +    }
> > +
> >      if (getpagesize() != ram_pagesize_summary()) {
> >          bool have_hp = false;
> >          /* We've got a huge page */
> > @@ -404,6 +411,18 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
> >      return 0;
> >  }
> >  
> > +static int get_mem_fault_cpu_index(uint32_t pid)
> > +{
> > +    CPUState *cpu_iter;
> > +
> > +    CPU_FOREACH(cpu_iter) {
> > +        if (cpu_iter->thread_id == pid)
> > +           return cpu_iter->cpu_index;
> > +    }
> > +    trace_get_mem_fault_cpu_index(pid);
> > +    return -1;
> > +}
> > +
> >  /*
> >   * Handle faults detected by the USERFAULT markings
> >   */
> > @@ -481,8 +500,10 @@ static void *postcopy_ram_fault_thread(void *opaque)
> >          rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
> >          trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
> >                                                  qemu_ram_get_idstr(rb),
> > -                                                rb_offset);
> > +                                                rb_offset, msg.arg.pagefault.feat.ptid);
> 
> Line length!
> 
> >  
> > +        mark_postcopy_downtime_begin(msg.arg.pagefault.address,
> > +                            get_mem_fault_cpu_index(msg.arg.pagefault.feat.ptid));
> >          /*
> >           * Send the request to the source - we want to request one
> >           * of our host page sizes (which is >= TPS)
> > @@ -577,6 +598,7 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
> >  
> >          return -e;
> >      }
> > +    mark_postcopy_downtime_end((uint64_t)host);
> >  
> >      trace_postcopy_place_page(host);
> >      return 0;
> > diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> > index 195fa94..c9f3e47 100644
> > --- a/migration/qemu-file.c
> > +++ b/migration/qemu-file.c
> > @@ -547,7 +547,6 @@ size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
> >  int qemu_peek_byte(QEMUFile *f, int offset)
> >  {
> >      int index = f->buf_index + offset;
> > -
> 
> Stray!
> 
> >      assert(!qemu_file_is_writable(f));
> >      assert(offset < IO_BUF_SIZE);
> >  
> > diff --git a/migration/trace-events b/migration/trace-events
> > index 7372ce2..ab2e1e4 100644
> > --- a/migration/trace-events
> > +++ b/migration/trace-events
> > @@ -110,6 +110,12 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
> >  process_incoming_migration_co_postcopy_end_main(void) ""
> >  migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
> >  migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname)  "ioc=%p ioctype=%s hostname=%s"
> > +mark_postcopy_downtime_begin(uint64_t addr, void *dd, int64_t time, int cpu) "addr 0x%" PRIx64 " dd %p time %" PRId64 " cpu %d"
> > +mark_postcopy_downtime_end(uint64_t addr, void *dd, int64_t time) "addr 0x%" PRIx64 " dd %p time %" PRId64
> > +get_postcopy_total_downtime(int num, uint64_t total) "faults %d, total downtime %" PRIu64
> > +split_duration_and_fill_points(int64_t downtime, uint64_t addr) "downtime %" PRId64 " addr 0x%" PRIx64
> > +downtime_per_cpu(int cpu_index, int downtime) "downtime cpu[%d]=%d"
> > +source_return_path_thread_downtime(uint64_t downtime) "downtime %" PRIu64
> >  
> >  # migration/rdma.c
> >  qemu_rdma_accept_incoming_migration(void) ""
> > @@ -186,7 +192,7 @@ postcopy_ram_enable_notify(void) ""
> >  postcopy_ram_fault_thread_entry(void) ""
> >  postcopy_ram_fault_thread_exit(void) ""
> >  postcopy_ram_fault_thread_quit(void) ""
> > -postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
> > +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, int pid) "Request for HVA=%" PRIx64 " rb=%s offset=%zx %d"
> >  postcopy_ram_incoming_cleanup_closeuf(void) ""
> >  postcopy_ram_incoming_cleanup_entry(void) ""
> >  postcopy_ram_incoming_cleanup_exit(void) ""
> > @@ -195,6 +201,7 @@ save_xbzrle_page_skipping(void) ""
> >  save_xbzrle_page_overflow(void) ""
> >  ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
> >  ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
> > +get_mem_fault_cpu_index(uint32_t pid) "pid %u is not vCPU"
> >  
> >  # migration/exec.c
> >  migration_exec_outgoing(const char *cmd) "cmd=%s"
> > -- 
> > 1.8.3.1
> > 
> 
> Dave
> 
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
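
For reference, a standalone sketch of the S/E sweep under review here; it uses a single-word CPU mask (so at most 64 vCPUs) and illustrative names rather than the QEMU code itself:

/*
 * Standalone sketch of the S/E sweep: points are sorted by time; we
 * accumulate every window that ends in an E point and whose preceding
 * run of S points covers all vCPUs.
 */
#include <stdint.h>
#include <stdbool.h>

typedef struct {
    int64_t tp;       /* point in time                        */
    bool is_end;      /* E (page copied in) vs S (page fault) */
    uint64_t cpus;    /* vCPUs blocked by this fault          */
} Point;

static int64_t total_downtime(const Point *p, int n, int ncpus)
{
    const uint64_t all = (ncpus >= 64) ? ~0ull : (1ull << ncpus) - 1;
    int64_t total = 0;
    int i, j;

    for (i = 1; i < n; i++) {
        uint64_t cur;

        /* only an S point directly followed by an E point can close
         * an all-vCPUs-blocked window */
        if (!p[i].is_end || p[i - 1].is_end) {
            continue;
        }
        cur = p[i].cpus;
        /* walk back over the preceding run of S points, or-ing masks */
        for (j = i - 1; j >= 0 && !p[j].is_end; j--) {
            cur |= p[j].cpus;
            if (cur == all) {
                total += p[i].tp - p[i - 1].tp;
                break;
            }
        }
    }
    return total;
}

Fed the three-CPU sequence from the comment (S1 S2 E1 S3 S1 E2 E3 E1), it accumulates exactly the S1..E2 window marked with x's in the diagram.
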
Dr. David Alan Gilbert April 24, 2017, 5:11 p.m. UTC | #4
* Alexey (a.perevalov@samsung.com) wrote:
> Hello, David!
> 
> 
> I apologize, I forgot to check the patches with the checkpatch.pl script, but
> now I have checked and fixed the code style in the patches. However, I also
> checked the files themselves: migration.c has code style errors, and
> glib-compat.h does too.
> I could send those patches to qemu-trivial, if you're not against it.

Feel free to send style patches to trivial;  if they're right next
to a line you're changing then you can include them in the same patch
but if they're elsewhere do as you say with a trivial patch.

Dave

> 
> On Fri, Apr 21, 2017 at 01:00:32PM +0100, Dr. David Alan Gilbert wrote:
> > * Alexey Perevalov (a.perevalov@samsung.com) wrote:
> > > This patch provides downtime calculation per vCPU,
> > > as a summary and as a overlapped value for all vCPUs.
> > > 
> > > This approach just keeps tree with page fault addr as a key,
> > > and t1-t2 interval of pagefault time and page copy time, with
> > > affected vCPU bit mask.
> > > For more implementation details please see comment to
> > > get_postcopy_total_downtime function.
> > > 
> > > Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>
> > > ---
> > >  include/migration/migration.h |  14 +++
> > >  migration/migration.c         | 280 +++++++++++++++++++++++++++++++++++++++++-
> > >  migration/postcopy-ram.c      |  24 +++-
> > >  migration/qemu-file.c         |   1 -
> > >  migration/trace-events        |   9 +-
> > >  5 files changed, 323 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > > index 5720c88..5d2c628 100644
> > > --- a/include/migration/migration.h
> > > +++ b/include/migration/migration.h
> > > @@ -123,10 +123,24 @@ struct MigrationIncomingState {
> > >  
> > >      /* See savevm.c */
> > >      LoadStateEntry_Head loadvm_handlers;
> > > +
> > > +    /*
> > > +     *  Tree for keeping postcopy downtime,
> > > +     *  necessary to calculate correct downtime, during multiple
> > > +     *  vm suspends, it keeps host page address as a key and
> > > +     *  DowntimeDuration as a data
> > > +     *  NULL means kernel couldn't provide process thread id,
> > > +     *  and QEMU couldn't identify which vCPU raise page fault
> > > +     */
> > > +    GTree *postcopy_downtime;
> > >  };
> > >  
> > >  MigrationIncomingState *migration_incoming_get_current(void);
> > >  void migration_incoming_state_destroy(void);
> > > +void mark_postcopy_downtime_begin(uint64_t addr, int cpu);
> > > +void mark_postcopy_downtime_end(uint64_t addr);
> > > +uint64_t get_postcopy_total_downtime(void);
> > > +void destroy_downtime_duration(gpointer data);
> > >  
> > >  /*
> > >   * An outstanding page request, on the source, having been received
> > > diff --git a/migration/migration.c b/migration/migration.c
> > > index 79f6425..5bac434 100644
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -38,6 +38,8 @@
> > >  #include "io/channel-tls.h"
> > >  #include "migration/colo.h"
> > >  
> > > +#define DEBUG_VCPU_DOWNTIME 1
> > > +
> > >  #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
> > >  
> > >  /* Amount of time to allocate to each "chunk" of bandwidth-throttled
> > > @@ -77,6 +79,19 @@ static NotifierList migration_state_notifiers =
> > >  
> > >  static bool deferred_incoming;
> > >  
> > > +typedef struct {
> > > +    int64_t begin;
> > > +    int64_t end;
> > > +    uint64_t *cpus; /* cpus bit mask array, QEMU bit functions support
> > > +     bit operation on memory regions, but doesn't check out of range */
> > > +} DowntimeDuration;
> > > +
> > > +typedef struct {
> > > +    int64_t tp; /* point in time */
> > > +    bool is_end;
> > > +    uint64_t *cpus;
> > > +} OverlapDowntime;
> > > +
> > >  /*
> > >   * Current state of incoming postcopy; note this is not part of
> > >   * MigrationIncomingState since it's state is used during cleanup
> > > @@ -117,6 +132,13 @@ MigrationState *migrate_get_current(void)
> > >      return &current_migration;
> > >  }
> > >  
> > > +void destroy_downtime_duration(gpointer data)
> > > +{
> > > +    DowntimeDuration *dd = (DowntimeDuration *)data;
> > > +    g_free(dd->cpus);
> > > +    g_free(data);
> > > +}
> > > +
> > >  MigrationIncomingState *migration_incoming_get_current(void)
> > >  {
> > >      static bool once;
> > > @@ -138,10 +160,13 @@ void migration_incoming_state_destroy(void)
> > >      struct MigrationIncomingState *mis = migration_incoming_get_current();
> > >  
> > >      qemu_event_destroy(&mis->main_thread_load_event);
> > > +    if (mis->postcopy_downtime) {
> > > +        g_tree_destroy(mis->postcopy_downtime);
> > > +        mis->postcopy_downtime = NULL;
> > > +    }
> > >      loadvm_free_handlers(mis);
> > >  }
> > >  
> > > -
> > >  typedef struct {
> > >      bool optional;
> > >      uint32_t size;
> > > @@ -1754,7 +1779,6 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
> > >       */
> > >      ms->postcopy_after_devices = true;
> > >      notifier_list_notify(&migration_state_notifiers, ms);
> > > -
> > 
> > Stray deletion
> > 
> > >      ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
> > >  
> > >      qemu_mutex_unlock_iothread();
> > > @@ -2117,3 +2141,255 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
> > >      return atomic_xchg(&incoming_postcopy_state, new_state);
> > >  }
> > >  
> > > +#define SIZE_TO_KEEP_CPUBITS (1 + smp_cpus/sizeof(guint64))
> > 
> > Split out your cpu-sets so that you have an 'alloc_cpu_set',
> > a 'set bit' a 'set all bits', dup etc
> > (I see Linux has cpumask.h that has a 'cpu_set' that's
> > basically the same thing, but we need something portablish.)
> > 
> > > +void mark_postcopy_downtime_begin(uint64_t addr, int cpu)
> > > +{
> > > +    MigrationIncomingState *mis = migration_incoming_get_current();
> > > +    DowntimeDuration *dd;
> > > +    if (!mis->postcopy_downtime) {
> > > +        return;
> > > +    }
> > > +
> > > +    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr); /* !!! cast */
> > > +    if (!dd) {
> > > +        dd = (DowntimeDuration *)g_new0(DowntimeDuration, 1);
> > > +        dd->cpus = g_new0(guint64, SIZE_TO_KEEP_CPUBITS);
> > > +        g_tree_insert(mis->postcopy_downtime, (gpointer)addr, (gpointer)dd);
> > > +    }
> > > +
> > > +    if (cpu < 0) {
> > > +        /* assume in this situation all vCPUs are sleeping */
> > > +        int i;
> > > +        for (i = 0; i < SIZE_TO_KEEP_CPUBITS; i++) {
> > > +            dd->cpus[i] = ~(uint64_t)0u;
> > > +        }
> > > +    } else
> > > +        set_bit(cpu, dd->cpus);
> > 
> > Qemu coding style: Use {}'s even on one line blocks
> > 
> > > +
> > > +    /*
> > > +     *  overwrite previously set dd->begin, if that page already was
> > > +     *     faulted on another cpu
> > > +     */
> > > +    dd->begin = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > 
> > OK, so this is making a decision that needs to be documented:
> > if one CPU was already paused at time (a), and then we see a second
> > CPU paused at time (b), the time we record only starts at (b) and
> > ignores the time from a..b - is that the way you want to do it?
> Yes, any time interval when at least one vCPU is running isn't counted.
> 
> > As I say, it should be documented somewhere; it's probably worth
> > adding something to docs/migration.txt about how this measurement works.
> > 
> > 
> > > +    trace_mark_postcopy_downtime_begin(addr, dd, dd->begin, cpu);
> > > +}
> > > +
> > > +void mark_postcopy_downtime_end(uint64_t addr)
> > > +{
> > > +    MigrationIncomingState *mis = migration_incoming_get_current();
> > > +    DowntimeDuration *dd;
> > > +    if (!mis->postcopy_downtime) {
> > > +        return;
> > > +    }
> > > +
> > > +    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
> > > +    if (!dd) {
> > > +        /* error_report("Could not populate downtime duration completion time \n\
> > > +                        There is no downtime duration for 0x%"PRIx64, addr); */
> > 
> > Error or no error - decide!   Is this happening for pages that arrive before
> > they've been requested?
> > 
> > > +        return;
> > > +    }
> > > +
> > > +    dd->end = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > > +    trace_mark_postcopy_downtime_end(addr, dd, dd->end);
> > > +}
> > > +
> > > +struct downtime_overlay_cxt {
> > > +    GPtrArray *downtime_points;
> > > +    size_t number_of_points;
> > > +};
> > 
> > Why 'cxt'?  If you mean it as an abbreviation of context, then we normally use ctxt.
> > 
> > > +/*
> > > + * This function splits each DowntimeDuration, which is represented as a
> > > + * start/end interval, into separate points, then fills the array with
> > > + * those points, to be sorted later.
> > > + */
> > > +static gboolean split_duration_and_fill_points(gpointer key, gpointer value,
> > > +                                        gpointer data)
> > > +{
> > > +    struct downtime_overlay_cxt *ctx = (struct downtime_overlay_cxt *)data;
> > > +    DowntimeDuration *dd = (DowntimeDuration *)value;
> > > +    GPtrArray *interval = ctx->downtime_points;
> > > +    if (dd->begin) {
> > > +        OverlapDowntime *od_begin = g_new0(OverlapDowntime, 1);
> > > +        od_begin->cpus = g_memdup(dd->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> > > +        od_begin->tp = dd->begin;
> > > +        od_begin->is_end = false;
> > > +        g_ptr_array_add(interval, od_begin);
> > > +        ctx->number_of_points += 1;
> > > +    }
> > > +
> > > +    if (dd->end) {
> > > +        OverlapDowntime *od_end = g_new0(OverlapDowntime, 1);
> > > +        od_end->cpus = g_memdup(dd->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> > > +        od_end->tp = dd->end;
> > > +        od_end->is_end = true;
> > > +        g_ptr_array_add(interval, od_end);
> > > +        ctx->number_of_points += 1;
> > > +    }
> > > +
> > > +    if (dd->end && dd->begin)
> > > +        trace_split_duration_and_fill_points(dd->end - dd->begin, (uint64_t)key);
> > 
> > again, need {}'s
> > 
> > > +    return FALSE;
> > > +}
> > > +
> > > +#ifdef DEBUG_VCPU_DOWNTIME
> > > +static gboolean calculate_per_cpu(gpointer key, gpointer value,
> > > +                                  gpointer data)
> > > +{
> > > +    int *downtime_cpu = (int *)data;
> > > +    DowntimeDuration *dd = (DowntimeDuration *)value;
> > > +    int cpu_iter;
> > > +    for (cpu_iter = 0; cpu_iter < smp_cpus; cpu_iter++) {
> > > +        if (test_bit(cpu_iter, dd->cpus) && dd->end && dd->begin)
> > > +            downtime_cpu[cpu_iter] += dd->end - dd->begin;
> > > +    }
> > > +    return FALSE;
> > > +}
> > > +#endif /* DEBUG_VCPU_DOWNTIME */
> > > +
> > > +static gint compare_downtime(gconstpointer a, gconstpointer b)
> > > +{
> > > +    DowntimeDuration *dda = (DowntimeDuration *)a;
> > > +    DowntimeDuration *ddb = (DowntimeDuration *)b;
> > > +    return dda->begin - ddb->begin;
> > > +}
> > > +
> > > +static void destroy_overlap_downtime(gpointer data)
> > > +{
> > > +    OverlapDowntime *od = (OverlapDowntime *)data;
> > > +    g_free(od->cpus);
> > > +    g_free(data);
> > > +}
> > > +
> > > +static int check_overlap(uint64_t *b)
> > > +{
> > > +    unsigned long zero_bit = find_first_zero_bit(b, BITS_PER_LONG * SIZE_TO_KEEP_CPUBITS);
> > 
> > Line's too long.
> > 
> > > +    return zero_bit >= smp_cpus;
> > 
> > So this is really 'all cpus are blocked'?
> yes, that's the condition for it
> > 
> > > +}
> > > +
> > > +/*
> > > + * This function calculates downtime per cpu and trace it
> > > + *
> > > + *  Also it calculates total downtime as an interval's overlap,
> > > + *  for many vCPU.
> > > + *
> > > + *  The approach is following:
> > > + *  Initially intervals are represented in tree where key is
> > > + *  pagefault address, and values:
> > > + *   begin - page fault time
> > > + *   end   - page load time
> > > + *   cpus  - bit mask shows affected cpus
> > > + *
> > > + *  To calculate overlap on all cpus, intervals converted into
> > > + *  array of points in time (downtime_points), the size of
> > > + *  array is 2 * number of nodes in tree of intervals (2 array
> > > + *  elements per one in element of interval).
> > > + *  Each element is marked as end (E) or as start (S) of interval.
> > > + *  The overlap downtime will be calculated for SE, only in case
> > > + *  there is sequence S(0..N)E(M) for every vCPU.
> > > + *
> > > + * As example we have 3 CPU
> > > + *
> > > + *      S1        E1           S1               E1
> > > + * -----***********------------xxx***************------------------------> CPU1
> > > + *
> > > + *             S2                E2
> > > + * ------------****************xxx---------------------------------------> CPU2
> > > + *
> > > + *                         S3            E3
> > > + * ------------------------****xxx********-------------------------------> CPU3
> > > + *
> > > + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
> > > + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3
> > > + * S3,S1,E2 - sequenece includes all CPUs, in this case overlap will be S1,E2
> >                        ^ typo
> > 
> > > + * Legend of picture is following: * - means downtime per vCPU
> > > + *                                 x - means overlapped downtime
> > > + */
> > > +uint64_t get_postcopy_total_downtime(void)
> > > +{
> > > +    MigrationIncomingState *mis = migration_incoming_get_current();
> > > +    uint64_t total_downtime = 0; /* for total overlapped downtime */
> > > +    const int intervals = g_tree_nnodes(mis->postcopy_downtime);
> > > +    int point_iter, start_point_iter, i;
> > > +    struct downtime_overlay_cxt dp_ctx = { 0 };
> > > +    /*
> > > +     * array will contain 2 * interval points or less, if
> > > +     * it was not page fault finalization for page,
> > > +     * real count will be in ctx.number_of_points
> > > +     */
> > > +    dp_ctx.downtime_points = g_ptr_array_new_full(2 * intervals,
> > > +                                                     destroy_overlap_downtime);
> > 
> > Is the g_ptr_array giving you anything here over a plain-old C array of pointers?
> > You're not dynamically growing it.
> Yes, I know the upper bound of that array at that point, and GPtrArray is
> maybe a little heavy here. OK, I'll use a plain array.
> 
> > 
> > > +    if (!mis->postcopy_downtime) {
> > > +        goto out;
> > > +    }
> > > +
> > > +#ifdef DEBUG_VCPU_DOWNTIME
> > > +    {
> > > +        gint *downtime_cpu = g_new0(int, smp_cpus);
> > > +        g_tree_foreach(mis->postcopy_downtime, calculate_per_cpu, downtime_cpu);
> > > +        for (point_iter = 0; point_iter < smp_cpus; point_iter++)
> > > +        {
> > > +            trace_downtime_per_cpu(point_iter, downtime_cpu[point_iter]);
> > > +        }
> > > +        g_free(downtime_cpu);
> > > +    }
> > > +#endif /* DEBUG_VCPU_DOWNTIME */
> > 
> > You might want to make that:
> >   if (TRACE_DOWNTIME_PER_CPU_ENABLED) {
> >   }
> > 
> > and remove the ifdef.
> > 
> > > +    /* make downtime points S/E from interval */
> > > +    g_tree_foreach(mis->postcopy_downtime, split_duration_and_fill_points,
> > > +                   &dp_ctx);
> > > +    g_ptr_array_sort(dp_ctx.downtime_points, compare_downtime);
> > > +
> > > +    for (point_iter = 1; point_iter < dp_ctx.number_of_points;
> > > +         point_iter++) {
> > > +        OverlapDowntime *od = g_ptr_array_index(dp_ctx.downtime_points,
> > > +                point_iter);
> > > +        uint64_t *cur_cpus;
> > > +        int smp_cpus_i = smp_cpus;
> > > +        OverlapDowntime *prev_od = g_ptr_array_index(dp_ctx.downtime_points,
> > > +                                                     point_iter - 1);
> > > +        if (!od || !prev_od)
> > > +            continue;
> > 
> > Why would that happen?
> Now the loop goes up to dp_ctx.number_of_points, so in this version it
> looks impossible.
> > 
> > > +        /* we need sequence SE */
> > > +        if (!od->is_end || prev_od->is_end)
> > > +            continue;
> > > +
> > > +        cur_cpus = g_memdup(od->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
> > > +        for (start_point_iter = point_iter - 1;
> > > +             start_point_iter >= 0 && smp_cpus_i;
> > > +             start_point_iter--, smp_cpus_i--) {
> > 
> > I think I see what you're doing in this loop, although it's a bit hairy;
> > I don't think I understand why we needed to get prev_od  if this loop is searching
> > backwards?
> Just for the condition,
> if (!od->is_end || prev_od->is_end) {
>     continue;
> }
> to skip any other sequences, like EE.
> Do you think the following condition is more readable?
> !(od->is_end && !prev_od->is_end)
>     continue;
> 
> Also, prev_od is the nearest point to the end, so the time from that
> point to the end is what's interesting.
> I depicted that above.
> > 
> > > +            OverlapDowntime *t_od = g_ptr_array_index(dp_ctx.downtime_points,
> > > +                                                      start_point_iter);
> > > +            if (!t_od)
> > > +                break;
> > > +            /* should be S */
> > > +            if (t_od->is_end)
> > > +                break;
> > > +
> > > +            /* points were sorted, it's possible when
> > > +             * the end has not occurred, but these points were omitted
> > > +             * in split_duration_and_fill_points */
> > > +            if (od->tp <= prev_od->tp) {
> > 
> > Why is this checking od and prev_od in this loop - isn't this
> > loop mainly t_od ?
> right, that code shouldn't be here.
> > 
> > > +                break;
> > > +            }
> > > +
> > > +            for (i = 0; i < SIZE_TO_KEEP_CPUBITS; i++) {
> > > +                cur_cpus[i] |= t_od->cpus[i];
> > > +            }
> > > +
> > > +            /* check_overlap - just count number of bits in cur_cpus,
> > > +             * and compare it with smp_cpus */
> > > +            if (check_overlap(cur_cpus)) {
> > > +                total_downtime += od->tp - prev_od->tp;
> > > +                /* situation when one S point represents all vCPU is possible */
> > > +                break;
> > > +            }
> > > +        }
> > > +        g_free(cur_cpus);
> > > +    }
> > > +    trace_get_postcopy_total_downtime(g_tree_nnodes(mis->postcopy_downtime),
> > > +        total_downtime);
> > > +out:
> > > +    g_ptr_array_free(dp_ctx.downtime_points, TRUE);
> > > +    return total_downtime;
> > > +}
> > > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> > > index 70f0480..ea89f4e 100644
> > > --- a/migration/postcopy-ram.c
> > > +++ b/migration/postcopy-ram.c
> > > @@ -23,8 +23,10 @@
> > >  #include "migration/postcopy-ram.h"
> > >  #include "sysemu/sysemu.h"
> > >  #include "sysemu/balloon.h"
> > > +#include <sys/param.h>
> > >  #include "qemu/error-report.h"
> > >  #include "trace.h"
> > > +#include "glib/glib-helper.h"
> > >  
> > >  /* Arbitrary limit on size of each discard command,
> > >   * keeps them around ~200 bytes
> > > @@ -81,6 +83,11 @@ static bool ufd_version_check(int ufd, MigrationIncomingState *mis)
> > >          return false;
> > >      }
> > >  
> > > +    if (mis && UFFD_FEATURE_THREAD_ID & api_struct.features) {
> > 
> > That's a very weird way of writing that test!  Also, I think you need
> > to still make this user-selectable given the complexity/cost.
> >
> Like that?
> {"execute": "migrate-set-capabilities" , "arguments":
> { "capabilities": [ { "capability": "calculate-postcopy-downtime", "state": true } ]
> } }                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^
> 
> I tried to put the heavy operations well after the hot path (page requesting
> and copying). The algorithm complexity is NumberOfPages * NumberOfvCPUs; in
> the case of hugepages that's not so many.
> OK, since the feature is conditionally obtained from the kernel anyway, why
> not give the user the ability to choose.
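
A sketch of how that opt-in could gate the tracking; migrate_postcopy_downtime() and the capability flag are hypothetical names taken from the QMP snippet above, not existing QEMU API, and the feature bit value is only assumed:

#include <stdbool.h>
#include <stdint.h>

/* assumed bit value; the real one comes from linux/userfaultfd.h */
#define UFFD_FEATURE_THREAD_ID (1u << 8)

static bool calculate_postcopy_downtime_cap; /* set via migrate-set-capabilities */

/* hypothetical capability query, mirroring migrate_postcopy_ram() */
static bool migrate_postcopy_downtime(void)
{
    return calculate_postcopy_downtime_cap;
}

/* allocate the downtime tree only when the user opted in AND the
 * kernel can report the faulting thread id */
static bool downtime_tracking_wanted(uint64_t ufd_features)
{
    return migrate_postcopy_downtime() &&
           (ufd_features & UFFD_FEATURE_THREAD_ID);
}
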
> 
> > > +        mis->postcopy_downtime = g_tree_new_full(g_int_cmp64,
> > > +                                         NULL, NULL, destroy_downtime_duration);
> > > +    }
> > > +
> > >      if (getpagesize() != ram_pagesize_summary()) {
> > >          bool have_hp = false;
> > >          /* We've got a huge page */
> > > @@ -404,6 +411,18 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,
> > >      return 0;
> > >  }
> > >  
> > > +static int get_mem_fault_cpu_index(uint32_t pid)
> > > +{
> > > +    CPUState *cpu_iter;
> > > +
> > > +    CPU_FOREACH(cpu_iter) {
> > > +        if (cpu_iter->thread_id == pid)
> > > +           return cpu_iter->cpu_index;
> > > +    }
> > > +    trace_get_mem_fault_cpu_index(pid);
> > > +    return -1;
> > > +}
> > > +
> > >  /*
> > >   * Handle faults detected by the USERFAULT markings
> > >   */
> > > @@ -481,8 +500,10 @@ static void *postcopy_ram_fault_thread(void *opaque)
> > >          rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
> > >          trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
> > >                                                  qemu_ram_get_idstr(rb),
> > > -                                                rb_offset);
> > > +                                                rb_offset, msg.arg.pagefault.feat.ptid);
> > 
> > Line length!
> > 
> > >  
> > > +        mark_postcopy_downtime_begin(msg.arg.pagefault.address,
> > > +                            get_mem_fault_cpu_index(msg.arg.pagefault.feat.ptid));
> > >          /*
> > >           * Send the request to the source - we want to request one
> > >           * of our host page sizes (which is >= TPS)
> > > @@ -577,6 +598,7 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
> > >  
> > >          return -e;
> > >      }
> > > +    mark_postcopy_downtime_end((uint64_t)host);
> > >  
> > >      trace_postcopy_place_page(host);
> > >      return 0;
> > > diff --git a/migration/qemu-file.c b/migration/qemu-file.c
> > > index 195fa94..c9f3e47 100644
> > > --- a/migration/qemu-file.c
> > > +++ b/migration/qemu-file.c
> > > @@ -547,7 +547,6 @@ size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
> > >  int qemu_peek_byte(QEMUFile *f, int offset)
> > >  {
> > >      int index = f->buf_index + offset;
> > > -
> > 
> > Stray!
> > 
> > >      assert(!qemu_file_is_writable(f));
> > >      assert(offset < IO_BUF_SIZE);
> > >  
> > > diff --git a/migration/trace-events b/migration/trace-events
> > > index 7372ce2..ab2e1e4 100644
> > > --- a/migration/trace-events
> > > +++ b/migration/trace-events
> > > @@ -110,6 +110,12 @@ process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
> > >  process_incoming_migration_co_postcopy_end_main(void) ""
> > >  migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
> > >  migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname)  "ioc=%p ioctype=%s hostname=%s"
> > > +mark_postcopy_downtime_begin(uint64_t addr, void *dd, int64_t time, int cpu) "addr 0x%" PRIx64 " dd %p time %" PRId64 " cpu %d"
> > > +mark_postcopy_downtime_end(uint64_t addr, void *dd, int64_t time) "addr 0x%" PRIx64 " dd %p time %" PRId64
> > > +get_postcopy_total_downtime(int num, uint64_t total) "faults %d, total downtime %" PRIu64
> > > +split_duration_and_fill_points(int64_t downtime, uint64_t addr) "downtime %" PRId64 " addr 0x%" PRIx64
> > > +downtime_per_cpu(int cpu_index, int downtime) "downtime cpu[%d]=%d"
> > > +source_return_path_thread_downtime(uint64_t downtime) "downtime %" PRIu64
> > >  
> > >  # migration/rdma.c
> > >  qemu_rdma_accept_incoming_migration(void) ""
> > > @@ -186,7 +192,7 @@ postcopy_ram_enable_notify(void) ""
> > >  postcopy_ram_fault_thread_entry(void) ""
> > >  postcopy_ram_fault_thread_exit(void) ""
> > >  postcopy_ram_fault_thread_quit(void) ""
> > > -postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
> > > +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, int pid) "Request for HVA=%" PRIx64 " rb=%s offset=%zx %d"
> > >  postcopy_ram_incoming_cleanup_closeuf(void) ""
> > >  postcopy_ram_incoming_cleanup_entry(void) ""
> > >  postcopy_ram_incoming_cleanup_exit(void) ""
> > > @@ -195,6 +201,7 @@ save_xbzrle_page_skipping(void) ""
> > >  save_xbzrle_page_overflow(void) ""
> > >  ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
> > >  ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
> > > +get_mem_fault_cpu_index(uint32_t pid) "pid %u is not vCPU"
> > >  
> > >  # migration/exec.c
> > >  migration_exec_outgoing(const char *cmd) "cmd=%s"
> > > -- 
> > > 1.8.3.1
> > > 
> > 
> > Dave
> > 
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> > 
> 
> -- 
> 
> BR
> Alexey
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert April 24, 2017, 5:13 p.m. UTC | #5
* Alexey (a.perevalov@samsung.com) wrote:
> Hello David,
> this mail just for CPUMASK discussion.
> 
> On Fri, Apr 21, 2017 at 01:00:32PM +0100, Dr. David Alan Gilbert wrote:
> > * Alexey Perevalov (a.perevalov@samsung.com) wrote:
> > > This patch provides downtime calculation per vCPU,
> > > as a summary and as a overlapped value for all vCPUs.
> > > 
> > > This approach just keeps tree with page fault addr as a key,
> > > and t1-t2 interval of pagefault time and page copy time, with
> > > affected vCPU bit mask.
> > > For more implementation details please see comment to
> > > get_postcopy_total_downtime function.
> > > 
> > > Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>
> > > ---
> > >  include/migration/migration.h |  14 +++
> > >  migration/migration.c         | 280 +++++++++++++++++++++++++++++++++++++++++-
> > >  migration/postcopy-ram.c      |  24 +++-
> > >  migration/qemu-file.c         |   1 -
> > >  migration/trace-events        |   9 +-
> > >  5 files changed, 323 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > > index 5720c88..5d2c628 100644
> > > --- a/include/migration/migration.h
> > > +++ b/include/migration/migration.h
> > > @@ -123,10 +123,24 @@ struct MigrationIncomingState {
> > >  
> > >      /* See savevm.c */
> > >      LoadStateEntry_Head loadvm_handlers;
> > > +
> > > +    /*
> > > +     *  Tree for keeping postcopy downtime,
> > > +     *  necessary to calculate correct downtime, during multiple
> > > +     *  vm suspends, it keeps host page address as a key and
> > > +     *  DowntimeDuration as a data
> > > +     *  NULL means kernel couldn't provide process thread id,
> > > +     *  and QEMU couldn't identify which vCPU raise page fault
> > > +     */
> > > +    GTree *postcopy_downtime;
> > >  };
> > >  
> > >  MigrationIncomingState *migration_incoming_get_current(void);
> > >  void migration_incoming_state_destroy(void);
> > > +void mark_postcopy_downtime_begin(uint64_t addr, int cpu);
> > > +void mark_postcopy_downtime_end(uint64_t addr);
> > > +uint64_t get_postcopy_total_downtime(void);
> > > +void destroy_downtime_duration(gpointer data);
> > >  
> > >  /*
> > >   * An outstanding page request, on the source, having been received
> > > diff --git a/migration/migration.c b/migration/migration.c
> > > index 79f6425..5bac434 100644
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -38,6 +38,8 @@
> > >  #include "io/channel-tls.h"
> > >  #include "migration/colo.h"
> > >  
> > > +#define DEBUG_VCPU_DOWNTIME 1
> > > +
> > >  #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
> > >  
> > >  /* Amount of time to allocate to each "chunk" of bandwidth-throttled
> > > @@ -77,6 +79,19 @@ static NotifierList migration_state_notifiers =
> > >  
> > >  static bool deferred_incoming;
> > >  
> > > +typedef struct {
> > > +    int64_t begin;
> > > +    int64_t end;
> > > +    uint64_t *cpus; /* cpus bit mask array, QEMU bit functions support
> > > +     bit operation on memory regions, but doesn't check out of range */
> > > +} DowntimeDuration;
> > > +
> > > +typedef struct {
> > > +    int64_t tp; /* point in time */
> > > +    bool is_end;
> > > +    uint64_t *cpus;
> > > +} OverlapDowntime;
> > > +
> > >  /*
> > >   * Current state of incoming postcopy; note this is not part of
> > >   * MigrationIncomingState since it's state is used during cleanup
> > > @@ -117,6 +132,13 @@ MigrationState *migrate_get_current(void)
> > >      return &current_migration;
> > >  }
> > >  
> > > +void destroy_downtime_duration(gpointer data)
> > > +{
> > > +    DowntimeDuration *dd = (DowntimeDuration *)data;
> > > +    g_free(dd->cpus);
> > > +    g_free(data);
> > > +}
> > > +
> > >  MigrationIncomingState *migration_incoming_get_current(void)
> > >  {
> > >      static bool once;
> > > @@ -138,10 +160,13 @@ void migration_incoming_state_destroy(void)
> > >      struct MigrationIncomingState *mis = migration_incoming_get_current();
> > >  
> > >      qemu_event_destroy(&mis->main_thread_load_event);
> > > +    if (mis->postcopy_downtime) {
> > > +        g_tree_destroy(mis->postcopy_downtime);
> > > +        mis->postcopy_downtime = NULL;
> > > +    }
> > >      loadvm_free_handlers(mis);
> > >  }
> > >  
> > > -
> > >  typedef struct {
> > >      bool optional;
> > >      uint32_t size;
> > > @@ -1754,7 +1779,6 @@ static int postcopy_start(MigrationState *ms, bool *old_vm_running)
> > >       */
> > >      ms->postcopy_after_devices = true;
> > >      notifier_list_notify(&migration_state_notifiers, ms);
> > > -
> > 
> > Stray deletion
> > 
> > >      ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
> > >  
> > >      qemu_mutex_unlock_iothread();
> > > @@ -2117,3 +2141,255 @@ PostcopyState postcopy_state_set(PostcopyState new_state)
> > >      return atomic_xchg(&incoming_postcopy_state, new_state);
> > >  }
> > >  
> > > +#define SIZE_TO_KEEP_CPUBITS (1 + smp_cpus/sizeof(guint64))
> > 
> > Split out your cpu-sets so that you have an 'alloc_cpu_set',
> > a 'set bit' a 'set all bits', dup etc
> > (I see Linux has cpumask.h that has a 'cpu_set' that's
> > basically the same thing, but we need something portablish.)
> > 
> Agreed, the way I'm working with the cpumask is a little naive:
> instead of setting all_cpumask precisely ((1 << smp_cpus) - 1) when all
> vCPUs are sleeping, I just set it all to ~0, because I didn't use
> functions like cpumask_and.
> If you think this patch should use a cpumask, a cpumask patchset/separate
> thread should be introduced first, and then this patchset should be
> rebased on top of it.

Yes, some functions like that - because then it makes this code
clearer; and there's less repetition.

Dave

> 
> BR
> Alexey
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
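
A rough sketch of what such portable cpu-set helpers could look like; the names and layout here are illustrative, not an existing QEMU or glib API:

/* Illustrative cpu-set helpers: alloc, set-bit, set-all, dup and an
 * "every vCPU present" test, stored as an array of unsigned long so
 * qemu/bitmap.h-style operations could be layered on top. */
#include <glib.h>
#include <stdbool.h>
#include <string.h>
#include <limits.h>

#define BITS_PER_ULONG (CHAR_BIT * sizeof(unsigned long))
#define CPU_SET_LONGS(ncpus) \
    (((ncpus) + BITS_PER_ULONG - 1) / BITS_PER_ULONG)

static unsigned long *cpu_set_alloc(int ncpus)
{
    return g_new0(unsigned long, CPU_SET_LONGS(ncpus));
}

static void cpu_set_set(unsigned long *set, int cpu)
{
    set[cpu / BITS_PER_ULONG] |= 1ul << (cpu % BITS_PER_ULONG);
}

static void cpu_set_set_all(unsigned long *set, int ncpus)
{
    memset(set, 0xff, CPU_SET_LONGS(ncpus) * sizeof(unsigned long));
}

static unsigned long *cpu_set_dup(const unsigned long *set, int ncpus)
{
    return g_memdup(set, CPU_SET_LONGS(ncpus) * sizeof(unsigned long));
}

/* the check_overlap() test from the patch: all vCPUs blocked? */
static bool cpu_set_is_full(const unsigned long *set, int ncpus)
{
    int cpu;
    for (cpu = 0; cpu < ncpus; cpu++) {
        if (!(set[cpu / BITS_PER_ULONG] & (1ul << (cpu % BITS_PER_ULONG)))) {
            return false;
        }
    }
    return true;
}

With helpers like these, check_overlap() in the patch reduces to cpu_set_is_full(cur_cpus, smp_cpus).
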
Peter Xu April 25, 2017, 8:24 a.m. UTC | #6
On Fri, Apr 14, 2017 at 04:17:18PM +0300, Alexey Perevalov wrote:

[...]

> +/*
> + * This function calculates downtime per cpu and trace it
> + *
> + *  Also it calculates total downtime as an interval's overlap,
> + *  for many vCPU.
> + *
> + *  The approach is following:
> + *  Initially intervals are represented in tree where key is
> + *  pagefault address, and values:
> + *   begin - page fault time
> + *   end   - page load time
> + *   cpus  - bit mask shows affected cpus
> + *
> + *  To calculate overlap on all cpus, intervals converted into
> + *  array of points in time (downtime_points), the size of
> + *  array is 2 * number of nodes in tree of intervals (2 array
> + *  elements per one in element of interval).
> + *  Each element is marked as end (E) or as start (S) of interval.
> + *  The overlap downtime will be calculated for SE, only in case
> + *  there is sequence S(0..N)E(M) for every vCPU.
> + *
> + * As example we have 3 CPU
> + *
> + *      S1        E1           S1               E1
> + * -----***********------------xxx***************------------------------> CPU1
> + *
> + *             S2                E2
> + * ------------****************xxx---------------------------------------> CPU2
> + *
> + *                         S3            E3
> + * ------------------------****xxx********-------------------------------> CPU3
> + *
> + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
> + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3
> + * S3,S1,E2 - sequenece includes all CPUs, in this case overlap will be S1,E2
> + * Legend of picture is following: * - means downtime per vCPU
> + *                                 x - means overlapped downtime
> + */

Not sure whether I get the point in this patch... iiuc we defined the
downtime here as the period when all vcpus are halted, right?

If so, I have a few questions:

- will this algorithm consume lots of memory? since I see we have one
  trace object per fault page address

- do we need to protect the tree to make sure there's no insertion
  when doing the calculation?

- if the only thing we want here is the "total downtime", whether
  below would work? (assuming N is vcpu numbers)

  a. define array cpu_fault_addr[N], to store current faulted address
     for each vcpu. When vcpu X is running, cpu_fault_addr[X] should
     be 0.

  b. when page fault happens on vcpu A, setup cpu_fault_addr[A] with
     corresponding fault address.

  c. when page copy finished, loop over cpu_fault_addr[] to see
     whether that matches any, clear corresponding element if matched.

  Then, we can just measure the period when cpu_fault_addr[] is all
  set (by tracing at both b. and c.). Can this work?

Thanks,
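
A minimal sketch of the a/b/c scheme above, with the timestamping at b. and c. folded in; all names are illustrative, locking is deliberately elided, and now_ms() stands in for qemu_clock_get_ms():

/* One fault slot per vCPU; total downtime accumulates only while
 * every slot is occupied. */
#include <stdint.h>
#include <stdbool.h>
#include <time.h>

#define N_VCPUS 4                        /* assumed known vCPU count  */

static uint64_t cpu_fault_addr[N_VCPUS]; /* 0 => that vCPU is running */
static int64_t all_blocked_since;        /* 0 => not all blocked      */
static int64_t total_downtime_ms;

static int64_t now_ms(void)              /* POSIX stand-in clock */
{
    struct timespec ts;
    clock_gettime(CLOCK_REALTIME, &ts);
    return (int64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
}

static bool all_blocked(void)
{
    int i;
    for (i = 0; i < N_VCPUS; i++) {
        if (!cpu_fault_addr[i]) {
            return false;
        }
    }
    return true;
}

/* b. page fault happened on vCPU 'cpu' at 'addr' */
static void on_fault(int cpu, uint64_t addr)
{
    cpu_fault_addr[cpu] = addr;
    if (!all_blocked_since && all_blocked()) {
        all_blocked_since = now_ms();
    }
}

/* c. page 'addr' was copied in: wake every vCPU that faulted on it */
static void on_copy(uint64_t addr)
{
    int i;
    for (i = 0; i < N_VCPUS; i++) {
        if (cpu_fault_addr[i] == addr) {
            cpu_fault_addr[i] = 0;
        }
    }
    if (all_blocked_since && !all_blocked()) {
        total_downtime_ms += now_ms() - all_blocked_since;
        all_blocked_since = 0;
    }
}

This is O(number of vCPUs) work per fault/copy with constant memory, instead of one tree node per faulted page.
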
Alexey Perevalov April 25, 2017, 10:10 a.m. UTC | #7
On 04/25/2017 11:24 AM, Peter Xu wrote:
> On Fri, Apr 14, 2017 at 04:17:18PM +0300, Alexey Perevalov wrote:
>
> [...]
>
>> +/*
>> + * This function calculates downtime per cpu and trace it
>> + *
>> + *  Also it calculates total downtime as an interval's overlap,
>> + *  for many vCPU.
>> + *
>> + *  The approach is following:
>> + *  Initially intervals are represented in tree where key is
>> + *  pagefault address, and values:
>> + *   begin - page fault time
>> + *   end   - page load time
>> + *   cpus  - bit mask shows affected cpus
>> + *
>> + *  To calculate overlap on all cpus, intervals converted into
>> + *  array of points in time (downtime_points), the size of
>> + *  array is 2 * number of nodes in tree of intervals (2 array
>> + *  elements per one in element of interval).
>> + *  Each element is marked as end (E) or as start (S) of interval.
>> + *  The overlap downtime will be calculated for SE, only in case
>> + *  there is sequence S(0..N)E(M) for every vCPU.
>> + *
>> + * As example we have 3 CPU
>> + *
>> + *      S1        E1           S1               E1
>> + * -----***********------------xxx***************------------------------> CPU1
>> + *
>> + *             S2                E2
>> + * ------------****************xxx---------------------------------------> CPU2
>> + *
>> + *                         S3            E3
>> + * ------------------------****xxx********-------------------------------> CPU3
>> + *
>> + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
>> + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3
>> + * S3,S1,E2 - sequenece includes all CPUs, in this case overlap will be S1,E2
>> + * Legend of picture is following: * - means downtime per vCPU
>> + *                                 x - means overlapped downtime
>> + */
> Not sure whether I get the point in this patch... iiuc we defined the
> downtime here as the period when all vcpus are halted, right?
>
> If so, I have a few questions:
>
> - will this algorithm consume lots of memory? since I see we have one
>    trace object per fault page address
I don't think it consumes too much. One DowntimeDuration takes (if I use
bitmap_try_new; in this patch set I used a pointer to a uint64_t array to
keep the bitmap, but I'm going to use include/qemu/bitmap.h, which works
with pointers to long):

2 * sizeof(int64_t) + ((smp_cpus + BITS_PER_BYTE * sizeof(long) - 1) /
(BITS_PER_BYTE * sizeof(long))) * sizeof(long)

so it's 16 bytes plus at least one long, per page fault.
Let's assume we migrate 256 vCPUs and 256 GB of RAM, and that the RAM is
backed by 4 KB pages - a really bad case:
16 + ((256 + 8 * 4 - 1) / (8 * 4)) * 4 = 48 bytes
(256 * 1024 * 1024 * 1024) / (4 * 1024) = 67108864 page faults, though not
all of these pages will actually be faulted, due to page pre-fetching.
67108864 * 48 = 3221225472 bytes (3 GB for that operation),
but I doubt anyone will use 4 KB pages for 256 GB; probably 2 MB or 1 GB
huge pages will be chosen on x86, and on ARM or other architectures the
values could differ.
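
A self-contained re-check of that worst-case arithmetic, under the same assumptions (256 vCPUs, 256 GB of RAM in 4 KB pages, every page faulted once):

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
    const uint64_t vcpus = 256;
    const uint64_t ram = 256ull << 30;     /* 256 GB                 */
    const uint64_t page = 4096;            /* 4 KB                   */

    /* DowntimeDuration: begin + end + bitmap of vcpus bits */
    uint64_t mask = (vcpus + 63) / 64 * 8; /* 32 bytes, same result
                                              with 32-bit longs      */
    uint64_t node = 2 * 8 + mask;          /* 48 bytes               */
    uint64_t faults = ram / page;          /* 67108864 faults        */

    /* prints: 48 bytes/fault -> 3.00 GB worst case */
    printf("%" PRIu64 " bytes/fault -> %.2f GB worst case\n",
           node, (double)(faults * node) / (1ull << 30));
    return 0;
}
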

>
> - do we need to protect the tree to make sure there's no insertion
>    when doing the calculation?
I asked the same question when I sent the RFC patches; the answer here is
no, we don't need to, because right now there is only one socket and one
listen thread (maybe in the future it will be required, perhaps after the
multifd patch set), and the calculation is done synchronously right after
migration completes.

>
> - if the only thing we want here is the "total downtime", whether
>    below would work? (assuming N is vcpu numbers)
>
>    a. define array cpu_fault_addr[N], to store current faulted address
>       for each vcpu. When vcpu X is running, cpu_fault_addr[X] should
>       be 0.
>
>    b. when page fault happens on vcpu A, setup cpu_fault_addr[A] with
>       corresponding fault address.
at this point we also need to check whether a fault is pending on all the
other vCPUs, and if it is, mark the current time as the start of the total
vCPU downtime.

>    c. when page copy finished, loop over cpu_fault_addr[] to see
>       whether that matches any, clear corresponding element if matched.
so when the page copy finishes and the total-vCPU mark is set, yes, that
interval is part of the total downtime.
>
>    Then, we can just measure the period when cpu_fault_addr[] is all
>    set (by tracing at both b. and c.). Can this work?
Yes, it works, but it's better to keep the time - cpu_fault_time; the
address is not important here, since the reason for the page fault doesn't
matter. Two vCPUs could fault on access to the same page - OK, that's not
a problem, just store the time when each was faulted.
It looks like a better algorithm, with lower complexity;
thank you a lot.


>
> Thanks,
>
Peter Xu April 25, 2017, 10:25 a.m. UTC | #8
On Tue, Apr 25, 2017 at 01:10:30PM +0300, Alexey Perevalov wrote:
> On 04/25/2017 11:24 AM, Peter Xu wrote:
> >On Fri, Apr 14, 2017 at 04:17:18PM +0300, Alexey Perevalov wrote:
> >
> >[...]
> >
> >>+/*
> >>+ * This function calculates downtime per cpu and trace it
> >>+ *
> >>+ *  Also it calculates total downtime as an interval's overlap,
> >>+ *  for many vCPU.
> >>+ *
> >>+ *  The approach is following:
> >>+ *  Initially intervals are represented in tree where key is
> >>+ *  pagefault address, and values:
> >>+ *   begin - page fault time
> >>+ *   end   - page load time
> >>+ *   cpus  - bit mask shows affected cpus
> >>+ *
> >>+ *  To calculate overlap on all cpus, intervals converted into
> >>+ *  array of points in time (downtime_points), the size of
> >>+ *  array is 2 * number of nodes in tree of intervals (2 array
> >>+ *  elements per one in element of interval).
> >>+ *  Each element is marked as end (E) or as start (S) of interval.
> >>+ *  The overlap downtime will be calculated for SE, only in case
> >>+ *  there is sequence S(0..N)E(M) for every vCPU.
> >>+ *
> >>+ * As example we have 3 CPU
> >>+ *
> >>+ *      S1        E1           S1               E1
> >>+ * -----***********------------xxx***************------------------------> CPU1
> >>+ *
> >>+ *             S2                E2
> >>+ * ------------****************xxx---------------------------------------> CPU2
> >>+ *
> >>+ *                         S3            E3
> >>+ * ------------------------****xxx********-------------------------------> CPU3
> >>+ *
> >>+ * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
> >>+ * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3
> >>+ * S3,S1,E2 - sequenece includes all CPUs, in this case overlap will be S1,E2
> >>+ * Legend of picture is following: * - means downtime per vCPU
> >>+ *                                 x - means overlapped downtime
> >>+ */
> >Not sure whether I get the point in this patch... iiuc we defined the
> >downtime here as the period when all vcpus are halted, right?
> >
> >If so, I have a few questions:
> >
> >- will this algorithm consume lots of memory? since I see we have one
> >   trace object per fault page address
> I don't think it consumes too much. One DowntimeDuration takes (if I use
> bitmap_try_new; in this patch set I used a pointer to a uint64_t array to
> keep the bitmap, but I'm going to use include/qemu/bitmap.h, which works
> with pointers to long):
> 
> 2 * sizeof(int64_t) + ((smp_cpus + BITS_PER_BYTE * sizeof(long) - 1) /
> (BITS_PER_BYTE * sizeof(long))) * sizeof(long)
> 
> so it's 16 bytes plus at least one long, per page fault.
> Let's assume we migrate 256 vCPUs and 256 GB of RAM, and that the RAM is
> backed by 4 KB pages - a really bad case:
> 16 + ((256 + 8 * 4 - 1) / (8 * 4)) * 4 = 48 bytes
> (256 * 1024 * 1024 * 1024) / (4 * 1024) = 67108864 page faults, though not
> all of these pages will actually be faulted, due to page pre-fetching.
> 67108864 * 48 = 3221225472 bytes (3 GB for that operation),
> but I doubt anyone will use 4 KB pages for 256 GB; probably 2 MB or 1 GB
> huge pages will be chosen on x86, and on ARM or other architectures the
> values could differ.

Hmm, it looks still big though...

> 
> >
> >- do we need to protect the tree to make sure there's no insertion
> >   when doing the calculation?
> I asked the same question when I sent the RFC patches; the answer here is
> no, we don't need to, because right now there is only one socket and one
> listen thread (maybe in the future it will be required, perhaps after the
> multifd patch set), and the calculation is done synchronously right after
> migration completes.

Okay.

> 
> >
> >- if the only thing we want here is the "total downtime", whether
> >   below would work? (assuming N is vcpu numbers)
> >
> >   a. define array cpu_fault_addr[N], to store current faulted address
> >      for each vcpu. When vcpu X is running, cpu_fault_addr[X] should
> >      be 0.
> >
> >   b. when page fault happens on vcpu A, setup cpu_fault_addr[A] with
> >      corresponding fault address.
> at this point we also need to check whether a fault is pending on all the
> other vCPUs, and if it is, mark the current time as the start of the total
> vCPU downtime.
> 
> >   c. when page copy finished, loop over cpu_fault_addr[] to see
> >      whether that matches any, clear corresponding element if matched.
> so when the page copy finishes and the total-vCPU mark is set, yes, that
> interval is part of the total downtime.
> >
> >   Then, we can just measure the period when cpu_fault_addr[] is all
> >   set (by tracing at both b. and c.). Can this work?
> Yes, it works, but it's better to keep the time - cpu_fault_time; the
> address is not important here, since the reason for the page fault doesn't
> matter.

We still need the addresses, though? So that when we do COPY, we can check
the new page's address against the stored ones, to know which vCPUs' bits
to clear.

> Two vCPUs could fault on access to the same page - OK, that's not a
> problem, just store the time when each was faulted.
> It looks like a better algorithm, with lower complexity;
> thank you a lot.

My pleasure. Thanks,
Alexey Perevalov April 25, 2017, 10:47 a.m. UTC | #9
On 04/25/2017 01:25 PM, Peter Xu wrote:
> On Tue, Apr 25, 2017 at 01:10:30PM +0300, Alexey Perevalov wrote:
>> On 04/25/2017 11:24 AM, Peter Xu wrote:
>>> On Fri, Apr 14, 2017 at 04:17:18PM +0300, Alexey Perevalov wrote:
>>>
>>> [...]
>>>
>>>> +/*
>>>> + * This function calculates downtime per cpu and trace it
>>>> + *
>>>> + *  Also it calculates total downtime as an interval's overlap,
>>>> + *  for many vCPU.
>>>> + *
>>>> + *  The approach is following:
>>>> + *  Initially intervals are represented in tree where key is
>>>> + *  pagefault address, and values:
>>>> + *   begin - page fault time
>>>> + *   end   - page load time
>>>> + *   cpus  - bit mask shows affected cpus
>>>> + *
>>>> + *  To calculate overlap on all cpus, intervals converted into
>>>> + *  array of points in time (downtime_points), the size of
>>>> + *  array is 2 * number of nodes in tree of intervals (2 array
>>>> + *  elements per one in element of interval).
>>>> + *  Each element is marked as end (E) or as start (S) of interval.
>>>> + *  The overlap downtime will be calculated for SE, only in case
>>>> + *  there is sequence S(0..N)E(M) for every vCPU.
>>>> + *
>>>> + * As example we have 3 CPU
>>>> + *
>>>> + *      S1        E1           S1               E1
>>>> + * -----***********------------xxx***************------------------------> CPU1
>>>> + *
>>>> + *             S2                E2
>>>> + * ------------****************xxx---------------------------------------> CPU2
>>>> + *
>>>> + *                         S3            E3
>>>> + * ------------------------****xxx********-------------------------------> CPU3
>>>> + *
>>>> + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1
>>>> + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3
>>>> + * S3,S1,E2 - sequenece includes all CPUs, in this case overlap will be S1,E2
>>>> + * Legend of picture is following: * - means downtime per vCPU
>>>> + *                                 x - means overlapped downtime
>>>> + */
>>> Not sure whether I get the point in this patch... iiuc we defined the
>>> downtime here as the period when all vcpus are halted, right?
>>>
>>> If so, I have a few questions:
>>>
>>> - will this algorithm consume lots of memory? since I see we have one
>>>    trace object per fault page address
>> I don't think it consumes too much. One DowntimeDuration
>> takes (if I use bitmap_try_new; in this patch set I used a pointer to
>> a uint64_t array to keep the bitmap,
>> but I'm going to use include/qemu/bitmap.h, which works with pointers to long)
>>
>> 2 * sizeof(int64_t) + ((smp_cpus + BITS_PER_BYTE * sizeof(long) - 1) /
>> (BITS_PER_BYTE * sizeof(long))) * sizeof(long)
>> so it's about 16 + at least 4 bytes per page fault.
>> Let's assume we migrate 256 vCPUs and 256 GB of RAM, and that RAM is backed
>> by 4 KB pages - it's a really bad case:
>> 16 + ((256 + 8 * 4 - 1) / (8 * 4)) * 4 = 52 bytes
>> (256 * 1024 * 1024 * 1024) / (4 * 1024) = 67108864 page faults, but not all
>> of these pages will be pagefaulted, due to page pre-fetching.
>> 67108864 * 52 = 3489660928 bytes (3.5 GB for that operation),
>> but I doubt anyone will use 4 KB pages for 256 GB; probably
>> 2 MB or 1 GB huge pages will be chosen on x86; on ARM or other
>> architectures the values could differ.
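
A quick sanity check of the arithmetic above, as a standalone sketch
(illustrative only: downtime_entry_bytes() is not part of the patch; it counts
the two timestamps plus the bitmap rounded up to whole longs, so the 52-byte
figure in the thread evidently also counts the 4-byte cpus pointer of a
32-bit host):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* bytes for one DowntimeDuration-like entry: two int64_t timestamps plus
 * a CPU bitmap rounded up to whole longs of the given size */
static size_t downtime_entry_bytes(unsigned smp_cpus, size_t long_size)
{
    size_t bits_per_long = 8 * long_size;
    size_t bitmap_longs = (smp_cpus + bits_per_long - 1) / bits_per_long;
    return 2 * sizeof(int64_t) + bitmap_longs * long_size;
}

int main(void)
{
    uint64_t faults = (256ULL << 30) / 4096;     /* 256 GB of 4 KB pages */
    size_t entry = downtime_entry_bytes(256, 4); /* 256 vCPUs, 32-bit long */

    printf("%zu bytes per fault, %" PRIu64 " faults, %" PRIu64 " bytes total\n",
           entry, faults, faults * (uint64_t)entry);
    return 0;
}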
> Hmm, it still looks big though...
>
>>> - do we need to protect the tree to make sure there's no insertion
>>>    when doing the calculation?
>> I asked the same question when I sent the RFC patches;
>> the answer here is no, we should not, because right now
>> there is only one socket and one listen thread (maybe in the future
>> it will be required, perhaps after the multi fd patch set),
>> and the calculation is done synchronously right after migration completes.
> Okay.
>
>>> - if the only thing we want here is the "total downtime", would
>>>    the below work? (assuming N is the number of vcpus)
>>>
>>>    a. define array cpu_fault_addr[N], to store current faulted address
>>>       for each vcpu. When vcpu X is running, cpu_fault_addr[X] should
>>>       be 0.
>>>
>>>    b. when page fault happens on vcpu A, setup cpu_fault_addr[A] with
>>>       corresponding fault address.
>> at this time we need to check whether a fault has happened on every other vCPU,
>> and if it has, mark the current time as the start of the total vCPU downtime.
>>
>>>    c. when page copy finished, loop over cpu_fault_addr[] to see
>>>       whether that matches any, clear corresponding element if matched.
>> so when the page copy is finished and the total-vCPU mark is set,
>> yes, that interval is a part of the total downtime.
>>>    Then, we can just measure the period when cpu_fault_addr[] is all
>>>    set (by tracing at both b. and c.). Can this work?
>> Yes, it works, but it's better to keep the time - cpu_fault_time;
>> the address is not important here, the reason for the page fault doesn't matter.
> We still need the addresses? So that when we do COPY, we can check the
> new page address against these stored ones, to know for which vcpus to
> clear the bit.
Frankly speaking, we do need them ) because there is no other way to determine
the vCPU at COPY time.

>
>> 2 vCPUs could fault due to access to the same page; ok, it's not a problem,
>> just store the time when the fault happened.
>> It looks like a better algorithm, with lower complexity,
>> thank you a lot.
> My pleasure. Thanks,
>
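
For reference, a minimal sketch of the array-based scheme agreed on above.
Everything here is illustrative: on_fault()/on_copy() stand in for the
fault-thread and place-page hooks, N_VCPUS stands in for smp_cpus, and the
per-vCPU fault times mentioned in the thread (cpu_fault_time) are omitted:

#include <stdbool.h>
#include <stdint.h>

#define N_VCPUS 4 /* stand-in for smp_cpus */

static uint64_t cpu_fault_addr[N_VCPUS]; /* 0 = vcpu is running */
static int64_t all_stopped_since;        /* 0 = not all vcpus are faulted */
static int64_t total_downtime;

static bool all_faulted(void)
{
    for (int i = 0; i < N_VCPUS; i++) {
        if (!cpu_fault_addr[i]) {
            return false;
        }
    }
    return true;
}

/* b. page fault on vcpu 'cpu' at address 'addr', timestamp 'now' */
static void on_fault(int cpu, uint64_t addr, int64_t now)
{
    cpu_fault_addr[cpu] = addr;
    if (!all_stopped_since && all_faulted()) {
        all_stopped_since = now; /* the total-downtime interval opens */
    }
}

/* c. page at 'addr' was copied in at timestamp 'now': wake matching vcpus */
static void on_copy(uint64_t addr, int64_t now)
{
    for (int i = 0; i < N_VCPUS; i++) {
        if (cpu_fault_addr[i] == addr) {
            cpu_fault_addr[i] = 0;
        }
    }
    if (all_stopped_since && !all_faulted()) {
        total_downtime += now - all_stopped_since; /* some vcpu runs again */
        all_stopped_since = 0;
    }
}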
diff mbox

Patch

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 5720c88..5d2c628 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -123,10 +123,24 @@  struct MigrationIncomingState {
 
     /* See savevm.c */
     LoadStateEntry_Head loadvm_handlers;
+
+    /*
+     *  Tree for keeping postcopy downtime,
+     *  necessary to calculate the correct downtime during multiple
+     *  vm suspends; it keeps the host page address as the key and
+     *  DowntimeDuration as the data.
+     *  NULL means the kernel couldn't provide the process thread id,
+     *  so QEMU couldn't identify which vCPU raised the page fault.
+     */
+    GTree *postcopy_downtime;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
 void migration_incoming_state_destroy(void);
+void mark_postcopy_downtime_begin(uint64_t addr, int cpu);
+void mark_postcopy_downtime_end(uint64_t addr);
+uint64_t get_postcopy_total_downtime(void);
+void destroy_downtime_duration(gpointer data);
 
 /*
  * An outstanding page request, on the source, having been received
diff --git a/migration/migration.c b/migration/migration.c
index 79f6425..5bac434 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -38,6 +38,8 @@ 
 #include "io/channel-tls.h"
 #include "migration/colo.h"
 
+#define DEBUG_VCPU_DOWNTIME 1
+
 #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
 
 /* Amount of time to allocate to each "chunk" of bandwidth-throttled
@@ -77,6 +79,19 @@  static NotifierList migration_state_notifiers =
 
 static bool deferred_incoming;
 
+typedef struct {
+    int64_t begin;
+    int64_t end;
+    uint64_t *cpus; /* cpus bit mask array; QEMU bit functions support
+     bit operations on memory regions, but don't check for out-of-range access */
+} DowntimeDuration;
+
+typedef struct {
+    int64_t tp; /* point in time */
+    bool is_end;
+    uint64_t *cpus;
+} OverlapDowntime;
+
 /*
  * Current state of incoming postcopy; note this is not part of
  * MigrationIncomingState since it's state is used during cleanup
@@ -117,6 +132,13 @@  MigrationState *migrate_get_current(void)
     return &current_migration;
 }
 
+void destroy_downtime_duration(gpointer data)
+{
+    DowntimeDuration *dd = (DowntimeDuration *)data;
+    g_free(dd->cpus);
+    g_free(data);
+}
+
 MigrationIncomingState *migration_incoming_get_current(void)
 {
     static bool once;
@@ -138,10 +160,13 @@  void migration_incoming_state_destroy(void)
     struct MigrationIncomingState *mis = migration_incoming_get_current();
 
     qemu_event_destroy(&mis->main_thread_load_event);
+    if (mis->postcopy_downtime) {
+        g_tree_destroy(mis->postcopy_downtime);
+        mis->postcopy_downtime = NULL;
+    }
     loadvm_free_handlers(mis);
 }
 
-
 typedef struct {
     bool optional;
     uint32_t size;
@@ -1754,7 +1779,6 @@  static int postcopy_start(MigrationState *ms, bool *old_vm_running)
      */
     ms->postcopy_after_devices = true;
     notifier_list_notify(&migration_state_notifiers, ms);
-
     ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
 
     qemu_mutex_unlock_iothread();
@@ -2117,3 +2141,255 @@  PostcopyState postcopy_state_set(PostcopyState new_state)
     return atomic_xchg(&incoming_postcopy_state, new_state);
 }
 
+#define SIZE_TO_KEEP_CPUBITS (1 + smp_cpus / (BITS_PER_BYTE * sizeof(guint64)))
+
+void mark_postcopy_downtime_begin(uint64_t addr, int cpu)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    DowntimeDuration *dd;
+    if (!mis->postcopy_downtime) {
+        return;
+    }
+
+    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
+    if (!dd) {
+        dd = g_new0(DowntimeDuration, 1);
+        dd->cpus = g_new0(guint64, SIZE_TO_KEEP_CPUBITS);
+        g_tree_insert(mis->postcopy_downtime, (gpointer)addr, (gpointer)dd);
+    }
+
+    if (cpu < 0) {
+        /* assume in this situation all vCPUs are sleeping */
+        int i;
+        for (i = 0; i < SIZE_TO_KEEP_CPUBITS; i++) {
+            dd->cpus[i] = ~(uint64_t)0u;
+        }
+    } else {
+        set_bit(cpu, dd->cpus);
+    }
+
+    /*
+     *  overwrite previously set dd->begin, if that page already was
+     *     faulted on another cpu
+     */
+    dd->begin = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    trace_mark_postcopy_downtime_begin(addr, dd, dd->begin, cpu);
+}
+
+void mark_postcopy_downtime_end(uint64_t addr)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    DowntimeDuration *dd;
+    if (!mis->postcopy_downtime) {
+        return;
+    }
+
+    dd = g_tree_lookup(mis->postcopy_downtime, (gpointer)addr);
+    if (!dd) {
+        /* error_report("Could not populate downtime duration completion time \n\
+                        There is no downtime duration for 0x%"PRIx64, addr); */
+        return;
+    }
+
+    dd->end = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    trace_mark_postcopy_downtime_end(addr, dd, dd->end);
+}
+
+struct downtime_overlay_ctx {
+    GPtrArray *downtime_points;
+    size_t number_of_points;
+};
+
+/*
+ * This function splits each DowntimeDuration, which is represented
+ * as a start/end pair, into points, and fills the array with those
+ * points, to be sorted later.
+ */
+static gboolean split_duration_and_fill_points(gpointer key, gpointer value,
+                                        gpointer data)
+{
+    struct downtime_overlay_ctx *ctx = (struct downtime_overlay_ctx *)data;
+    DowntimeDuration *dd = (DowntimeDuration *)value;
+    GPtrArray *interval = ctx->downtime_points;
+    if (dd->begin) {
+        OverlapDowntime *od_begin = g_new0(OverlapDowntime, 1);
+        od_begin->cpus = g_memdup(dd->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
+        od_begin->tp = dd->begin;
+        od_begin->is_end = false;
+        g_ptr_array_add(interval, od_begin);
+        ctx->number_of_points += 1;
+    }
+
+    if (dd->end) {
+        OverlapDowntime *od_end = g_new0(OverlapDowntime, 1);
+        od_end->cpus = g_memdup(dd->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
+        od_end->tp = dd->end;
+        od_end->is_end = true;
+        g_ptr_array_add(interval, od_end);
+        ctx->number_of_points += 1;
+    }
+
+    if (dd->end && dd->begin) {
+        trace_split_duration_and_fill_points(dd->end - dd->begin,
+                                             (uint64_t)key);
+    }
+    return FALSE;
+}
+
+#ifdef DEBUG_VCPU_DOWNTIME
+static gboolean calculate_per_cpu(gpointer key, gpointer value,
+                                  gpointer data)
+{
+    int *downtime_cpu = (int *)data;
+    DowntimeDuration *dd = (DowntimeDuration *)value;
+    int cpu_iter;
+    for (cpu_iter = 0; cpu_iter < smp_cpus; cpu_iter++) {
+        if (test_bit(cpu_iter, dd->cpus) && dd->end && dd->begin) {
+            downtime_cpu[cpu_iter] += dd->end - dd->begin;
+        }
+    }
+    return FALSE;
+}
+#endif /* DEBUG_VCPU_DOWNTIME */
+
+static gint compare_downtime(gconstpointer a, gconstpointer b)
+{
+    /* g_ptr_array_sort() passes pointers to the array elements */
+    const OverlapDowntime *oda = *(const OverlapDowntime **)a;
+    const OverlapDowntime *odb = *(const OverlapDowntime **)b;
+    return (oda->tp < odb->tp) ? -1 : (oda->tp > odb->tp);
+}
+
+static void destroy_overlap_downtime(gpointer data)
+{
+    OverlapDowntime *od = (OverlapDowntime *)data;
+    g_free(od->cpus);
+    g_free(data);
+}
+
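+/* returns nonzero only when all of the first smp_cpus bits in b are set */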
+static int check_overlap(uint64_t *b)
+{
+    unsigned long zero_bit = find_first_zero_bit(b, BITS_PER_LONG * SIZE_TO_KEEP_CPUBITS);
+    return zero_bit >= smp_cpus;
+}
+
+/*
+ * This function calculates downtime per cpu and traces it
+ *
+ *  It also calculates the total downtime as the overlap of intervals
+ *  across many vCPUs.
+ *
+ *  The approach is the following:
+ *  Initially intervals are kept in a tree where the key is the
+ *  pagefault address, and the values are:
+ *   begin - page fault time
+ *   end   - page load time
+ *   cpus  - bit mask of affected cpus
+ *
+ *  To calculate the overlap on all cpus, intervals are converted into
+ *  an array of points in time (downtime_points); the size of the
+ *  array is 2 * the number of nodes in the tree of intervals (2 array
+ *  elements per interval element).
+ *  Each element is marked as the end (E) or the start (S) of an interval.
+ *  The overlapped downtime will be calculated for an SE pair, only when
+ *  the sequence S(0..N)E(M) covers every vCPU.
+ *
+ * As an example we have 3 CPUs
+ *
+ *      S1        E1           S1               E1
+ * -----***********------------xxx***************------------------------> CPU1
+ *
+ *             S2                E2
+ * ------------****************xxx---------------------------------------> CPU2
+ *
+ *                         S3            E3
+ * ------------------------****xxx********-------------------------------> CPU3
+ *
+ * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
+ * S2,E1 - doesn't match the condition because the sequence S1,S2,E1 doesn't include CPU3
+ * S3,S1,E2 - the sequence includes all CPUs, in this case the overlap will be S1,E2
+ * The legend of the picture is as follows: * - downtime per vCPU
+ *                                          x - overlapped downtime
+ */
+uint64_t get_postcopy_total_downtime(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    uint64_t total_downtime = 0; /* for total overlapped downtime */
+    int intervals;
+    int point_iter, start_point_iter, i;
+    struct downtime_overlay_ctx dp_ctx = { 0 };
+
+    if (!mis->postcopy_downtime) {
+        return 0;
+    }
+
+    intervals = g_tree_nnodes(mis->postcopy_downtime);
+    /*
+     * the array will contain 2 * intervals points, or fewer if page
+     * fault finalization did not happen for some pages; the real
+     * count will be in dp_ctx.number_of_points
+     */
+    dp_ctx.downtime_points = g_ptr_array_new_full(2 * intervals,
+                                                  destroy_overlap_downtime);
+
+#ifdef DEBUG_VCPU_DOWNTIME
+    {
+        gint *downtime_cpu = g_new0(int, smp_cpus);
+        g_tree_foreach(mis->postcopy_downtime, calculate_per_cpu, downtime_cpu);
+        for (point_iter = 0; point_iter < smp_cpus; point_iter++) {
+            trace_downtime_per_cpu(point_iter, downtime_cpu[point_iter]);
+        }
+        g_free(downtime_cpu);
+    }
+#endif /* DEBUG_VCPU_DOWNTIME */
+
+    /* make downtime points S/E from interval */
+    g_tree_foreach(mis->postcopy_downtime, split_duration_and_fill_points,
+                   &dp_ctx);
+    g_ptr_array_sort(dp_ctx.downtime_points, compare_downtime);
+
+    for (point_iter = 1; point_iter < dp_ctx.number_of_points;
+         point_iter++) {
+        OverlapDowntime *od = g_ptr_array_index(dp_ctx.downtime_points,
+                point_iter);
+        uint64_t *cur_cpus;
+        int smp_cpus_i = smp_cpus;
+        OverlapDowntime *prev_od = g_ptr_array_index(dp_ctx.downtime_points,
+                                                     point_iter - 1);
+        if (!od || !prev_od) {
+            continue;
+        }
+        /* we need the sequence SE */
+        if (!od->is_end || prev_od->is_end) {
+            continue;
+        }
+
+        cur_cpus = g_memdup(od->cpus, sizeof(uint64_t) * SIZE_TO_KEEP_CPUBITS);
+        for (start_point_iter = point_iter - 1;
+             start_point_iter >= 0 && smp_cpus_i;
+             start_point_iter--, smp_cpus_i--) {
+            OverlapDowntime *t_od = g_ptr_array_index(dp_ctx.downtime_points,
+                                                      start_point_iter);
+            if (!t_od) {
+                break;
+            }
+            /* should be S */
+            if (t_od->is_end) {
+                break;
+            }
+
+            /*
+             * points were sorted; this can happen when the end did not
+             * occur for a page and those points were omitted in
+             * split_duration_and_fill_points
+             */
+            if (od->tp <= prev_od->tp) {
+                break;
+            }
+
+            for (i = 0; i < SIZE_TO_KEEP_CPUBITS; i++) {
+                cur_cpus[i] |= t_od->cpus[i];
+            }
+
+            /* check_overlap() tests whether all smp_cpus bits
+             * are set in cur_cpus */
+            if (check_overlap(cur_cpus)) {
+                total_downtime += od->tp - prev_od->tp;
+                /* a single S point may represent all vCPUs */
+                break;
+            }
+        }
+        g_free(cur_cpus);
+    }
+    trace_get_postcopy_total_downtime(g_tree_nnodes(mis->postcopy_downtime),
+        total_downtime);
+    g_ptr_array_free(dp_ctx.downtime_points, TRUE);
+    return total_downtime;
+}
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 70f0480..ea89f4e 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -23,8 +23,10 @@ 
 #include "migration/postcopy-ram.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/balloon.h"
+#include <sys/param.h>
 #include "qemu/error-report.h"
 #include "trace.h"
+#include "glib/glib-helper.h"
 
 /* Arbitrary limit on size of each discard command,
  * keeps them around ~200 bytes
@@ -81,6 +83,11 @@  static bool ufd_version_check(int ufd, MigrationIncomingState *mis)
         return false;
     }
 
+    if (mis && (api_struct.features & UFFD_FEATURE_THREAD_ID)) {
+        mis->postcopy_downtime = g_tree_new_full(g_int_cmp64, NULL, NULL,
+                                                 destroy_downtime_duration);
+    }
+
     if (getpagesize() != ram_pagesize_summary()) {
         bool have_hp = false;
         /* We've got a huge page */
@@ -404,6 +411,18 @@  static int ram_block_enable_notify(const char *block_name, void *host_addr,
     return 0;
 }
 
+static int get_mem_fault_cpu_index(uint32_t pid)
+{
+    CPUState *cpu_iter;
+
+    CPU_FOREACH(cpu_iter) {
+        if (cpu_iter->thread_id == pid) {
+            return cpu_iter->cpu_index;
+        }
+    }
+    trace_get_mem_fault_cpu_index(pid);
+    return -1;
+}
+
 /*
  * Handle faults detected by the USERFAULT markings
  */
@@ -481,8 +500,10 @@  static void *postcopy_ram_fault_thread(void *opaque)
         rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
         trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                 qemu_ram_get_idstr(rb),
-                                                rb_offset);
+                                                rb_offset,
+                                                msg.arg.pagefault.feat.ptid);
 
+        mark_postcopy_downtime_begin(msg.arg.pagefault.address,
+                            get_mem_fault_cpu_index(msg.arg.pagefault.feat.ptid));
         /*
          * Send the request to the source - we want to request one
          * of our host page sizes (which is >= TPS)
@@ -577,6 +598,7 @@  int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
 
         return -e;
     }
+    mark_postcopy_downtime_end((uint64_t)host);
 
     trace_postcopy_place_page(host);
     return 0;
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 195fa94..c9f3e47 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -547,7 +547,6 @@  size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
 int qemu_peek_byte(QEMUFile *f, int offset)
 {
     int index = f->buf_index + offset;
-
     assert(!qemu_file_is_writable(f));
     assert(offset < IO_BUF_SIZE);
 
diff --git a/migration/trace-events b/migration/trace-events
index 7372ce2..ab2e1e4 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -110,6 +110,12 @@  process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
 process_incoming_migration_co_postcopy_end_main(void) ""
 migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
 migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname)  "ioc=%p ioctype=%s hostname=%s"
+mark_postcopy_downtime_begin(uint64_t addr, void *dd, int64_t time, int cpu) "addr 0x%" PRIx64 " dd %p time %" PRId64 " cpu %d"
+mark_postcopy_downtime_end(uint64_t addr, void *dd, int64_t time) "addr 0x%" PRIx64 " dd %p time %" PRId64
+get_postcopy_total_downtime(int num, uint64_t total) "faults %d, total downtime %" PRIu64
+split_duration_and_fill_points(int64_t downtime, uint64_t addr) "downtime %" PRId64 " addr 0x%" PRIx64
+downtime_per_cpu(int cpu_index, int downtime) "downtime cpu[%d]=%d"
+source_return_path_thread_downtime(uint64_t downtime) "downtime %" PRIu64
 
 # migration/rdma.c
 qemu_rdma_accept_incoming_migration(void) ""
@@ -186,7 +192,7 @@  postcopy_ram_enable_notify(void) ""
 postcopy_ram_fault_thread_entry(void) ""
 postcopy_ram_fault_thread_exit(void) ""
 postcopy_ram_fault_thread_quit(void) ""
-postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, int pid) "Request for HVA=%" PRIx64 " rb=%s offset=%zx pid=%d"
 postcopy_ram_incoming_cleanup_closeuf(void) ""
 postcopy_ram_incoming_cleanup_entry(void) ""
 postcopy_ram_incoming_cleanup_exit(void) ""
@@ -195,6 +201,7 @@  save_xbzrle_page_skipping(void) ""
 save_xbzrle_page_overflow(void) ""
 ram_save_iterate_big_wait(uint64_t milliconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
 ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
+get_mem_fault_cpu_index(uint32_t pid) "pid %u is not a vCPU thread"
 
 # migration/exec.c
 migration_exec_outgoing(const char *cmd) "cmd=%s"
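
As a footnote to the overlap comment in migration.c above: a standalone sketch
of the same S/E idea, using a simpler forward sweep (a per-CPU open-interval
count) instead of the patch's backward scan. The data is illustrative and
reproduces the 3-CPU picture (S1,S2,E1,S3,S1,E2,E3,E1); the expected
overlapped downtime is the S1..E2 span, here 1 time unit:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define N_CPUS 3

typedef struct {
    int64_t tp;    /* point in time */
    int is_end;    /* S = 0, E = 1 */
    unsigned cpus; /* bit mask of affected CPUs */
} Point;

/* forward sweep over points sorted by tp: time is counted while every
 * CPU has at least one open fault interval */
static int64_t total_overlap(const Point *p, int n)
{
    int open[N_CPUS] = { 0 };
    int64_t total = 0, last_tp = 0;

    for (int i = 0; i < n; i++) {
        int all_blocked = 1;
        for (int c = 0; c < N_CPUS; c++) {
            if (!open[c]) {
                all_blocked = 0;
            }
        }
        if (all_blocked) {
            total += p[i].tp - last_tp;
        }
        for (int c = 0; c < N_CPUS; c++) {
            if (p[i].cpus & (1u << c)) {
                open[c] += p[i].is_end ? -1 : 1;
            }
        }
        last_tp = p[i].tp;
    }
    return total;
}

int main(void)
{
    /* S1,S2,E1,S3,S1,E2,E3,E1 from the comment's picture */
    Point pts[] = {
        { 1, 0, 1u << 0 }, { 2, 0, 1u << 1 }, { 3, 1, 1u << 0 },
        { 4, 0, 1u << 2 }, { 5, 0, 1u << 0 }, { 6, 1, 1u << 1 },
        { 7, 1, 1u << 2 }, { 8, 1, 1u << 0 },
    };

    printf("total overlapped downtime: %" PRId64 "\n",
           total_overlap(pts, (int)(sizeof(pts) / sizeof(pts[0]))));
    return 0;
}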