[{"id":1772741,"web_url":"http://patchwork.ozlabs.org/comment/1772741/","msgid":"<20170921115746.GB2717@work-vm>","list_archive_url":null,"date":"2017-09-21T11:57:47","subject":"Re: [Qemu-devel] [PATCH v10 07/10] migration: calculate vCPU\n\tblocktime on dst side","submitter":{"id":48102,"url":"http://patchwork.ozlabs.org/api/people/48102/","name":"Dr. David Alan Gilbert","email":"dgilbert@redhat.com"},"content":"* Alexey Perevalov (a.perevalov@samsung.com) wrote:\n> This patch provides blocktime calculation per vCPU,\n> as a summary and as a overlapped value for all vCPUs.\n> \n> This approach was suggested by Peter Xu, as an improvements of\n> previous approch where QEMU kept tree with faulted page address and cpus bitmask\n> in it. Now QEMU is keeping array with faulted page address as value and vCPU\n> as index. It helps to find proper vCPU at UFFD_COPY time. Also it keeps\n> list for blocktime per vCPU (could be traced with page_fault_addr)\n> \n> Blocktime will not calculated if postcopy_blocktime field of\n> MigrationIncomingState wasn't initialized.\n> \n> Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>\n> ---\n>  migration/postcopy-ram.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++-\n>  migration/trace-events   |   5 +-\n>  2 files changed, 140 insertions(+), 3 deletions(-)\n> \n> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c\n> index cc78981..9a5133f 100644\n> --- a/migration/postcopy-ram.c\n> +++ b/migration/postcopy-ram.c\n> @@ -110,7 +110,6 @@ static struct PostcopyBlocktimeContext *blocktime_context_new(void)\n>  \n>      ctx->exit_notifier.notify = migration_exit_cb;\n>      qemu_add_exit_notifier(&ctx->exit_notifier);\n> -    add_migration_state_change_notifier(&ctx->postcopy_notifier);\n>      return ctx;\n>  }\n>  \n> @@ -559,6 +558,136 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,\n>      return 0;\n>  }\n>  \n> +static int get_mem_fault_cpu_index(uint32_t pid)\n> +{\n> +    CPUState *cpu_iter;\n> +\n> +    CPU_FOREACH(cpu_iter) {\n> +        if (cpu_iter->thread_id == pid) {\n> +            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);\n> +            return cpu_iter->cpu_index;\n> +        }\n> +    }\n> +    trace_get_mem_fault_cpu_index(-1, pid);\n> +    return -1;\n> +}\n> +\n> +/*\n> + * This function is being called when pagefault occurs. It\n> + * tracks down vCPU blocking time.\n> + *\n> + * @addr: faulted host virtual address\n> + * @ptid: faulted process thread id\n> + * @rb: ramblock appropriate to addr\n> + */\n> +static void mark_postcopy_blocktime_begin(uint64_t addr, uint32_t ptid,\n> +                                          RAMBlock *rb)\n> +{\n> +    int cpu, already_received;\n> +    MigrationIncomingState *mis = migration_incoming_get_current();\n> +    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;\n> +    int64_t now_ms;\n> +\n> +    if (!dc || ptid == 0) {\n> +        return;\n> +    }\n> +    cpu = get_mem_fault_cpu_index(ptid);\n> +    if (cpu < 0) {\n> +        return;\n> +    }\n> +\n> +    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);\n> +    if (dc->vcpu_addr[cpu] == 0) {\n> +        atomic_inc(&dc->smp_cpus_down);\n> +    }\n> +\n> +    atomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr);\n> +    atomic_xchg__nocheck(&dc->last_begin, now_ms);\n> +    atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], now_ms);\n> +\n> +    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);\n> +    if (already_received) {\n> +        atomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0);\n> +        atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], 0);\n> +        atomic_sub(&dc->smp_cpus_down, 1);\n> +    }\n> +    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],\n> +                                        cpu, already_received);\n> +}\n> +\n> +/*\n> + *  This function just provide calculated blocktime per cpu and trace it.\n> + *  Total blocktime is calculated in mark_postcopy_blocktime_end.\n> + *\n> + *\n> + * Assume we have 3 CPU\n> + *\n> + *      S1        E1           S1               E1\n> + * -----***********------------xxx***************------------------------> CPU1\n> + *\n> + *             S2                E2\n> + * ------------****************xxx---------------------------------------> CPU2\n> + *\n> + *                         S3            E3\n> + * ------------------------****xxx********-------------------------------> CPU3\n> + *\n> + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1\n> + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3\n> + * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 -\n> + *            it's a part of total blocktime.\n> + * S1 - here is last_begin\n> + * Legend of the picture is following:\n> + *              * - means blocktime per vCPU\n> + *              x - means overlapped blocktime (total blocktime)\n> + *\n> + * @addr: host virtual address\n> + */\n> +static void mark_postcopy_blocktime_end(uint64_t addr)\n> +{\n> +    MigrationIncomingState *mis = migration_incoming_get_current();\n> +    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;\n> +    int i, affected_cpu = 0;\n> +    int64_t now_ms;\n> +    bool vcpu_total_blocktime = false;\n> +\n> +    if (!dc) {\n> +        return;\n> +    }\n> +\n> +    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);\n> +\n> +    /* lookup cpu, to clear it,\n> +     * that algorithm looks straighforward, but it's not\n> +     * optimal, more optimal algorithm is keeping tree or hash\n> +     * where key is address value is a list of  */\n> +    for (i = 0; i < smp_cpus; i++) {\n> +        uint64_t vcpu_blocktime = 0;\n> +        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr) {\n> +            continue;\n> +        }\n> +        atomic_xchg__nocheck(&dc->vcpu_addr[i], 0);\n> +        vcpu_blocktime = now_ms -\n> +            atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);\n> +        affected_cpu += 1;\n> +        /* we need to know is that mark_postcopy_end was due to\n> +         * faulted page, another possible case it's prefetched\n> +         * page and in that case we shouldn't be here */\n> +        if (!vcpu_total_blocktime &&\n> +            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {\n> +            vcpu_total_blocktime = true;\n> +        }\n> +        /* continue cycle, due to one page could affect several vCPUs */\n> +        dc->vcpu_blocktime[i] += vcpu_blocktime;\n> +    }\n\nUnfortunately this still isn't thread safe; consider the code in\nmark_postcopy_blocktime_begin is:\n\n 1 check vcpu_addr\n 2 write vcpu_addr\n 3 write last_begin\n 4 write vcpu_time\n 5 smp_cpus_down++\n\n 6  already_received:\n 7     write addr = 0\n 8     write vcpu_time = 0\n 9     smp_cpus_down--\n\nand this code is:\n a check vcpu_addr\n b write vcpu_addr\n c read vcpu_time\n d read smp_cpus_down\n\n e dec smp_cpus_down\n\nif (a) happens after (2) but before (3), (c) and (d) can also\nhappen before (3), and so you end up reading a bogus\nvcpu_time.\n\nThis is tricky to get right; if you changed the source to do:\n 1 check vcpu_addr\n 3 write last_begin\n 4 write vcpu_time\n 5 smp_cpus_down++\n 2 write vcpu_addr\n\n 6  already_received:\n 7     write addr = 0\n 8     write vcpu_time = 0\n 9     smp_cpus_down--\n\nI think it's safer;  if you read a good vcpu_addr you know\nthat the vcpu_time has already been written.\n\nHowever, can this check (a) happen between  the new (2) and (7) ?\nIt's slim but I think possibly; on the receiving side we've\njust set the bitmap flag to say received - if a fault comes\nin at about the same time then we could end up with\n\n1,3,4,5,2 ab 6 7 8 9 cde\n\nSo again we end up reading a bogus vcpu_time and double decrement\nsmp_cpus_down.\n\nSo I think we have to have:\n\n  a'  read vcpu_addr\n  b'  read vcpu_time\n  c'  if vcpu_addr == addr && vcpu_time != 0 ...\n  d'    clear vcpu_addr\n  e'    read/dec smp_cpus_down\n\nYou should comment to say where the order is important as well;\nbecause we'll never remember this - it's hairy!\n(Better suggestions welcome)\n\nDave\n\n> +    atomic_sub(&dc->smp_cpus_down, affected_cpu);\n> +    if (vcpu_total_blocktime) {\n> +        dc->total_blocktime += now_ms - atomic_fetch_add(&dc->last_begin, 0);\n> +    }\n> +    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,\n> +                                      affected_cpu);\n> +}\n> +\n>  /*\n>   * Handle faults detected by the USERFAULT markings\n>   */\n> @@ -636,8 +765,11 @@ static void *postcopy_ram_fault_thread(void *opaque)\n>          rb_offset &= ~(qemu_ram_pagesize(rb) - 1);\n>          trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,\n>                                                  qemu_ram_get_idstr(rb),\n> -                                                rb_offset);\n> +                                                rb_offset,\n> +                                                msg.arg.pagefault.feat.ptid);\n>  \n> +        mark_postcopy_blocktime_begin((uintptr_t)(msg.arg.pagefault.address),\n> +                                      msg.arg.pagefault.feat.ptid, rb);\n>          /*\n>           * Send the request to the source - we want to request one\n>           * of our host page sizes (which is >= TPS)\n> @@ -727,6 +859,8 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,\n>      if (!ret) {\n>          ramblock_recv_bitmap_set_range(rb, host_addr,\n>                                         pagesize / qemu_target_page_size());\n> +        mark_postcopy_blocktime_end((uint64_t)(uintptr_t)host_addr);\n> +\n>      }\n>      return ret;\n>  }\n> diff --git a/migration/trace-events b/migration/trace-events\n> index d2910a6..01f30fe 100644\n> --- a/migration/trace-events\n> +++ b/migration/trace-events\n> @@ -114,6 +114,8 @@ process_incoming_migration_co_end(int ret, int ps) \"ret=%d postcopy-state=%d\"\n>  process_incoming_migration_co_postcopy_end_main(void) \"\"\n>  migration_set_incoming_channel(void *ioc, const char *ioctype) \"ioc=%p ioctype=%s\"\n>  migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname)  \"ioc=%p ioctype=%s hostname=%s\"\n> +mark_postcopy_blocktime_begin(uint64_t addr, void *dd, int64_t time, int cpu, int received) \"addr: 0x%\" PRIx64 \", dd: %p, time: %\" PRId64 \", cpu: %d, already_received: %d\"\n> +mark_postcopy_blocktime_end(uint64_t addr, void *dd, int64_t time, int affected_cpu) \"addr: 0x%\" PRIx64 \", dd: %p, time: %\" PRId64 \", affected_cpu: %d\"\n>  \n>  # migration/rdma.c\n>  qemu_rdma_accept_incoming_migration(void) \"\"\n> @@ -190,7 +192,7 @@ postcopy_ram_enable_notify(void) \"\"\n>  postcopy_ram_fault_thread_entry(void) \"\"\n>  postcopy_ram_fault_thread_exit(void) \"\"\n>  postcopy_ram_fault_thread_quit(void) \"\"\n> -postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) \"Request for HVA=0x%\" PRIx64 \" rb=%s offset=0x%zx\"\n> +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, uint32_t pid) \"Request for HVA=0x%\" PRIx64 \" rb=%s offset=0x%zx pid=%u\"\n>  postcopy_ram_incoming_cleanup_closeuf(void) \"\"\n>  postcopy_ram_incoming_cleanup_entry(void) \"\"\n>  postcopy_ram_incoming_cleanup_exit(void) \"\"\n> @@ -199,6 +201,7 @@ save_xbzrle_page_skipping(void) \"\"\n>  save_xbzrle_page_overflow(void) \"\"\n>  ram_save_iterate_big_wait(uint64_t milliconds, int iterations) \"big wait: %\" PRIu64 \" milliseconds, %d iterations\"\n>  ram_load_complete(int ret, uint64_t seq_iter) \"exit_code %d seq iteration %\" PRIu64\n> +get_mem_fault_cpu_index(int cpu, uint32_t pid) \"cpu: %d, pid: %u\"\n>  \n>  # migration/exec.c\n>  migration_exec_outgoing(const char *cmd) \"cmd=%s\"\n> -- \n> 1.9.1\n> \n--\nDr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK","headers":{"Return-Path":"<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>","X-Original-To":"incoming@patchwork.ozlabs.org","Delivered-To":"patchwork-incoming@bilbo.ozlabs.org","Authentication-Results":["ozlabs.org;\n\tspf=pass (mailfrom) smtp.mailfrom=nongnu.org\n\t(client-ip=2001:4830:134:3::11; helo=lists.gnu.org;\n\tenvelope-from=qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org;\n\treceiver=<UNKNOWN>)","ext-mx08.extmail.prod.ext.phx2.redhat.com;\n\tdmarc=none (p=none dis=none) header.from=redhat.com","ext-mx08.extmail.prod.ext.phx2.redhat.com;\n\tspf=fail smtp.mailfrom=dgilbert@redhat.com"],"Received":["from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11])\n\t(using TLSv1 with cipher AES256-SHA (256/256 bits))\n\t(No client certificate requested)\n\tby ozlabs.org (Postfix) with ESMTPS id 3xyZrJ4fJsz9t42\n\tfor <incoming@patchwork.ozlabs.org>;\n\tThu, 21 Sep 2017 21:58:23 +1000 (AEST)","from localhost ([::1]:53085 helo=lists.gnu.org)\n\tby lists.gnu.org with esmtp (Exim 4.71) (envelope-from\n\t<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>)\n\tid 1dv07Y-0006kB-JH\n\tfor incoming@patchwork.ozlabs.org; Thu, 21 Sep 2017 07:58:20 -0400","from eggs.gnu.org ([2001:4830:134:3::10]:36259)\n\tby lists.gnu.org with esmtp (Exim 4.71)\n\t(envelope-from <dgilbert@redhat.com>) id 1dv07E-0006jv-Du\n\tfor qemu-devel@nongnu.org; Thu, 21 Sep 2017 07:58:02 -0400","from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)\n\t(envelope-from <dgilbert@redhat.com>) id 1dv079-0004Pk-I2\n\tfor qemu-devel@nongnu.org; Thu, 21 Sep 2017 07:58:00 -0400","from mx1.redhat.com ([209.132.183.28]:55436)\n\tby eggs.gnu.org with esmtps (TLS1.0:DHE_RSA_AES_256_CBC_SHA1:32)\n\t(Exim 4.71) (envelope-from <dgilbert@redhat.com>) id 1dv079-0004Oy-8C\n\tfor qemu-devel@nongnu.org; Thu, 21 Sep 2017 07:57:55 -0400","from smtp.corp.redhat.com\n\t(int-mx04.intmail.prod.int.phx2.redhat.com [10.5.11.14])\n\t(using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits))\n\t(No client certificate requested)\n\tby mx1.redhat.com (Postfix) with ESMTPS id D3A84C0546C6;\n\tThu, 21 Sep 2017 11:57:53 +0000 (UTC)","from work-vm (ovpn-117-186.ams2.redhat.com [10.36.117.186])\n\tby smtp.corp.redhat.com (Postfix) with ESMTPS id 820F15D97C;\n\tThu, 21 Sep 2017 11:57:49 +0000 (UTC)"],"DMARC-Filter":"OpenDMARC Filter v1.3.2 mx1.redhat.com D3A84C0546C6","Date":"Thu, 21 Sep 2017 12:57:47 +0100","From":"\"Dr. David Alan Gilbert\" <dgilbert@redhat.com>","To":"Alexey Perevalov <a.perevalov@samsung.com>","Message-ID":"<20170921115746.GB2717@work-vm>","References":"<1505839684-10046-1-git-send-email-a.perevalov@samsung.com>\n\t<CGME20170919164825eucas1p213639e95387bf054270d020f1d7dbfe9@eucas1p2.samsung.com>\n\t<1505839684-10046-8-git-send-email-a.perevalov@samsung.com>","MIME-Version":"1.0","Content-Type":"text/plain; charset=us-ascii","Content-Disposition":"inline","In-Reply-To":"<1505839684-10046-8-git-send-email-a.perevalov@samsung.com>","User-Agent":"Mutt/1.8.3 (2017-05-23)","X-Scanned-By":"MIMEDefang 2.79 on 10.5.11.14","X-Greylist":"Sender IP whitelisted, not delayed by milter-greylist-4.5.16\n\t(mx1.redhat.com [10.5.110.32]);\n\tThu, 21 Sep 2017 11:57:54 +0000 (UTC)","X-detected-operating-system":"by eggs.gnu.org: GNU/Linux 2.2.x-3.x [generic]\n\t[fuzzy]","X-Received-From":"209.132.183.28","Subject":"Re: [Qemu-devel] [PATCH v10 07/10] migration: calculate vCPU\n\tblocktime on dst side","X-BeenThere":"qemu-devel@nongnu.org","X-Mailman-Version":"2.1.21","Precedence":"list","List-Id":"<qemu-devel.nongnu.org>","List-Unsubscribe":"<https://lists.nongnu.org/mailman/options/qemu-devel>,\n\t<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>","List-Archive":"<http://lists.nongnu.org/archive/html/qemu-devel/>","List-Post":"<mailto:qemu-devel@nongnu.org>","List-Help":"<mailto:qemu-devel-request@nongnu.org?subject=help>","List-Subscribe":"<https://lists.nongnu.org/mailman/listinfo/qemu-devel>,\n\t<mailto:qemu-devel-request@nongnu.org?subject=subscribe>","Cc":"heetae82.ahn@samsung.com, quintela@redhat.com, qemu-devel@nongnu.org,\n\tpeterx@redhat.com, i.maximets@samsung.com","Errors-To":"qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org","Sender":"\"Qemu-devel\"\n\t<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>"}},{"id":1773369,"web_url":"http://patchwork.ozlabs.org/comment/1773369/","msgid":"<38158e8f-3f64-7eb3-fa46-673e39b15989@samsung.com>","list_archive_url":null,"date":"2017-09-22T08:47:03","subject":"Re: [Qemu-devel] [PATCH v10 07/10] migration: calculate vCPU\n\tblocktime on dst side","submitter":{"id":18164,"url":"http://patchwork.ozlabs.org/api/people/18164/","name":"Alexey Perevalov","email":"a.perevalov@samsung.com"},"content":"On 09/21/2017 02:57 PM, Dr. David Alan Gilbert wrote:\n> * Alexey Perevalov (a.perevalov@samsung.com) wrote:\n>> This patch provides blocktime calculation per vCPU,\n>> as a summary and as a overlapped value for all vCPUs.\n>>\n>> This approach was suggested by Peter Xu, as an improvements of\n>> previous approch where QEMU kept tree with faulted page address and cpus bitmask\n>> in it. Now QEMU is keeping array with faulted page address as value and vCPU\n>> as index. It helps to find proper vCPU at UFFD_COPY time. Also it keeps\n>> list for blocktime per vCPU (could be traced with page_fault_addr)\n>>\n>> Blocktime will not calculated if postcopy_blocktime field of\n>> MigrationIncomingState wasn't initialized.\n>>\n>> Signed-off-by: Alexey Perevalov<a.perevalov@samsung.com>\n>> ---\n>>   migration/postcopy-ram.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++-\n>>   migration/trace-events   |   5 +-\n>>   2 files changed, 140 insertions(+), 3 deletions(-)\n>>\n>> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c\n>> index cc78981..9a5133f 100644\n>> --- a/migration/postcopy-ram.c\n>> +++ b/migration/postcopy-ram.c\n>> @@ -110,7 +110,6 @@ static struct PostcopyBlocktimeContext *blocktime_context_new(void)\n>>   \n>>       ctx->exit_notifier.notify = migration_exit_cb;\n>>       qemu_add_exit_notifier(&ctx->exit_notifier);\n>> -    add_migration_state_change_notifier(&ctx->postcopy_notifier);\n>>       return ctx;\n>>   }\n>>   \n>> @@ -559,6 +558,136 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,\n>>       return 0;\n>>   }\n>>   \n>> +static int get_mem_fault_cpu_index(uint32_t pid)\n>> +{\n>> +    CPUState *cpu_iter;\n>> +\n>> +    CPU_FOREACH(cpu_iter) {\n>> +        if (cpu_iter->thread_id == pid) {\n>> +            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);\n>> +            return cpu_iter->cpu_index;\n>> +        }\n>> +    }\n>> +    trace_get_mem_fault_cpu_index(-1, pid);\n>> +    return -1;\n>> +}\n>> +\n>> +/*\n>> + * This function is being called when pagefault occurs. It\n>> + * tracks down vCPU blocking time.\n>> + *\n>> + * @addr: faulted host virtual address\n>> + * @ptid: faulted process thread id\n>> + * @rb: ramblock appropriate to addr\n>> + */\n>> +static void mark_postcopy_blocktime_begin(uint64_t addr, uint32_t ptid,\n>> +                                          RAMBlock *rb)\n>> +{\n>> +    int cpu, already_received;\n>> +    MigrationIncomingState *mis = migration_incoming_get_current();\n>> +    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;\n>> +    int64_t now_ms;\n>> +\n>> +    if (!dc || ptid == 0) {\n>> +        return;\n>> +    }\n>> +    cpu = get_mem_fault_cpu_index(ptid);\n>> +    if (cpu < 0) {\n>> +        return;\n>> +    }\n>> +\n>> +    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);\n>> +    if (dc->vcpu_addr[cpu] == 0) {\n>> +        atomic_inc(&dc->smp_cpus_down);\n>> +    }\n>> +\n>> +    atomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr);\n>> +    atomic_xchg__nocheck(&dc->last_begin, now_ms);\n>> +    atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], now_ms);\n>> +\n>> +    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);\n>> +    if (already_received) {\n>> +        atomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0);\n>> +        atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], 0);\n>> +        atomic_sub(&dc->smp_cpus_down, 1);\n>> +    }\n>> +    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],\n>> +                                        cpu, already_received);\n>> +}\n>> +\n>> +/*\n>> + *  This function just provide calculated blocktime per cpu and trace it.\n>> + *  Total blocktime is calculated in mark_postcopy_blocktime_end.\n>> + *\n>> + *\n>> + * Assume we have 3 CPU\n>> + *\n>> + *      S1        E1           S1               E1\n>> + * -----***********------------xxx***************------------------------> CPU1\n>> + *\n>> + *             S2                E2\n>> + * ------------****************xxx---------------------------------------> CPU2\n>> + *\n>> + *                         S3            E3\n>> + * ------------------------****xxx********-------------------------------> CPU3\n>> + *\n>> + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1\n>> + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3\n>> + * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 -\n>> + *            it's a part of total blocktime.\n>> + * S1 - here is last_begin\n>> + * Legend of the picture is following:\n>> + *              * - means blocktime per vCPU\n>> + *              x - means overlapped blocktime (total blocktime)\n>> + *\n>> + * @addr: host virtual address\n>> + */\n>> +static void mark_postcopy_blocktime_end(uint64_t addr)\n>> +{\n>> +    MigrationIncomingState *mis = migration_incoming_get_current();\n>> +    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;\n>> +    int i, affected_cpu = 0;\n>> +    int64_t now_ms;\n>> +    bool vcpu_total_blocktime = false;\n>> +\n>> +    if (!dc) {\n>> +        return;\n>> +    }\n>> +\n>> +    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);\n>> +\n>> +    /* lookup cpu, to clear it,\n>> +     * that algorithm looks straighforward, but it's not\n>> +     * optimal, more optimal algorithm is keeping tree or hash\n>> +     * where key is address value is a list of  */\n>> +    for (i = 0; i < smp_cpus; i++) {\n>> +        uint64_t vcpu_blocktime = 0;\n>> +        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr) {\n>> +            continue;\n>> +        }\n>> +        atomic_xchg__nocheck(&dc->vcpu_addr[i], 0);\n>> +        vcpu_blocktime = now_ms -\n>> +            atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);\n>> +        affected_cpu += 1;\n>> +        /* we need to know is that mark_postcopy_end was due to\n>> +         * faulted page, another possible case it's prefetched\n>> +         * page and in that case we shouldn't be here */\n>> +        if (!vcpu_total_blocktime &&\n>> +            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {\n>> +            vcpu_total_blocktime = true;\n>> +        }\n>> +        /* continue cycle, due to one page could affect several vCPUs */\n>> +        dc->vcpu_blocktime[i] += vcpu_blocktime;\n>> +    }\n> Unfortunately this still isn't thread safe; consider the code in\n> mark_postcopy_blocktime_begin is:\n>\n>   1 check vcpu_addr\n>   2 write vcpu_addr\n>   3 write last_begin\n>   4 write vcpu_time\n>   5 smp_cpus_down++\nI think sequence is following:\n1 check vcpu_addr\n     1.1 smp_cpus_down++\n2 write vcpu_addr\n3 write last_begin\n4 write vcpu_time\n\n>\n>   6  already_received:\n>   7     write addr = 0\n>   8     write vcpu_time = 0\n>   9     smp_cpus_down--\n>\n> and this code is:\n>   a check vcpu_addr\nthat check is part of lookup, you assume, lookup completed successfully,\nbut data for that lookup is not yet set.\n>   b write vcpu_addr\n>   c read vcpu_time\n>   d read smp_cpus_down\n>\n>   e dec smp_cpus_down\n>\n> if (a) happens after (2) but before (3), (c) and (d) can also\n> happen before (3), and so you end up reading a bogus\n> vcpu_time.\nyes in this case total_blocktime will be longer\n>\n> This is tricky to get right; if you changed the source to do:\n>   1 check vcpu_addr\n>   3 write last_begin\n>   4 write vcpu_time\n>   5 smp_cpus_down++\n>   2 write vcpu_addr\n>\n>   6  already_received:\n>   7     write addr = 0\n>   8     write vcpu_time = 0\n>   9     smp_cpus_down--\n>\n> I think it's safer;  if you read a good vcpu_addr you know\n> that the vcpu_time has already been written.\n>\n> However, can this check (a) happen between  the new (2) and (7) ?\n> It's slim but I think possibly; on the receiving side we've\n> just set the bitmap flag to say received - if a fault comes\n> in at about the same time then we could end up with\n>\n> 1,3,4,5,2 ab 6 7 8 9 cde\n>\n> So again we end up reading a bogus vcpu_time and double decrement\n> smp_cpus_down.\nmm, double decrement will break this feature till the end of\nmigration.\n\nas I remember I offered second bitmap, in v6 maybe.\n>\n> So I think we have to have:\n>\n>    a'  read vcpu_addr\n>    b'  read vcpu_time\n>    c'  if vcpu_addr == addr && vcpu_time != 0 ...\n>    d'    clear vcpu_addr\n>    e'    read/dec smp_cpus_down\n>\n> You should comment to say where the order is important as well;\n> because we'll never remember this - it's hairy!\n> (Better suggestions welcome)\nprotect whole dc by mutex is not considered due to it's hot path.","headers":{"Return-Path":"<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>","X-Original-To":"incoming@patchwork.ozlabs.org","Delivered-To":"patchwork-incoming@bilbo.ozlabs.org","Authentication-Results":"ozlabs.org;\n\tspf=pass (mailfrom) smtp.mailfrom=nongnu.org\n\t(client-ip=2001:4830:134:3::11; helo=lists.gnu.org;\n\tenvelope-from=qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org;\n\treceiver=<UNKNOWN>)","Received":["from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11])\n\t(using TLSv1 with cipher AES256-SHA (256/256 bits))\n\t(No client certificate requested)\n\tby ozlabs.org (Postfix) with ESMTPS id 3xz6Yl1rgWz9t16\n\tfor <incoming@patchwork.ozlabs.org>;\n\tFri, 22 Sep 2017 18:47:39 +1000 (AEST)","from localhost ([::1]:57391 helo=lists.gnu.org)\n\tby lists.gnu.org with esmtp (Exim 4.71) (envelope-from\n\t<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>)\n\tid 1dvJcX-0002Gn-CZ\n\tfor incoming@patchwork.ozlabs.org; Fri, 22 Sep 2017 04:47:37 -0400","from eggs.gnu.org ([2001:4830:134:3::10]:53802)\n\tby lists.gnu.org with esmtp (Exim 4.71)\n\t(envelope-from <a.perevalov@samsung.com>) id 1dvJcB-0002Gi-LO\n\tfor qemu-devel@nongnu.org; Fri, 22 Sep 2017 04:47:17 -0400","from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)\n\t(envelope-from <a.perevalov@samsung.com>) id 1dvJc8-0008Qq-Ew\n\tfor qemu-devel@nongnu.org; Fri, 22 Sep 2017 04:47:15 -0400","from mailout1.w1.samsung.com ([210.118.77.11]:47681)\n\tby eggs.gnu.org with esmtps (TLS1.0:DHE_RSA_AES_256_CBC_SHA1:32)\n\t(Exim 4.71) (envelope-from <a.perevalov@samsung.com>)\n\tid 1dvJc8-0008Pi-4S\n\tfor qemu-devel@nongnu.org; Fri, 22 Sep 2017 04:47:12 -0400","from eucas1p2.samsung.com (unknown [182.198.249.207])\n\tby mailout1.w1.samsung.com (KnoxPortal) with ESMTP id\n\t20170922084706euoutp013bbd2488e0671f18b227a43bb099d8e0~morW5V6m10210302103euoutp01W;\n\tFri, 22 Sep 2017 08:47:06 +0000 (GMT)","from eusmges3.samsung.com (unknown [203.254.199.242]) by\n\teucas1p1.samsung.com (KnoxPortal) with ESMTP id\n\t20170922084706eucas1p1d381fc368956fc3fcfe7adb888d79a7d~morWNfhEi3087530875eucas1p1S;\n\tFri, 22 Sep 2017 08:47:06 +0000 (GMT)","from eucas1p1.samsung.com ( [182.198.249.206]) by\n\teusmges3.samsung.com (EUCPMTA) with SMTP id 75.7D.12867.90EC4C95;\n\tFri, 22 Sep 2017 09:47:05 +0100 (BST)","from eusmgms2.samsung.com (unknown [182.198.249.180]) by\n\teucas1p1.samsung.com (KnoxPortal) with ESMTP id\n\t20170922084705eucas1p1db28872da71401b1a4bd0161e9fdb16d~morVcY7Im3076730767eucas1p1A;\n\tFri, 22 Sep 2017 08:47:05 +0000 (GMT)","from eusync4.samsung.com ( [203.254.199.214]) by\n\teusmgms2.samsung.com (EUCPMTA) with SMTP id BF.36.20118.90EC4C95;\n\tFri, 22 Sep 2017 09:47:05 +0100 (BST)","from [106.109.129.199] by eusync4.samsung.com (Oracle\n\tCommunications Messaging Server 7.0.5.31.0 64bit (built May 5 2014))\n\twith ESMTPA id <0OWO006UAB2G2EA0@eusync4.samsung.com>;\n\tFri, 22 Sep 2017 09:47:05 +0100 (BST)"],"X-AuditID":"cbfec7f2-f793b6d000003243-90-59c4ce09321e","From":"Alexey Perevalov <a.perevalov@samsung.com>","To":"\"Dr. David Alan Gilbert\" <dgilbert@redhat.com>","Message-id":"<38158e8f-3f64-7eb3-fa46-673e39b15989@samsung.com>","Date":"Fri, 22 Sep 2017 11:47:03 +0300","User-Agent":"Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101\n\tThunderbird/52.2.1","MIME-version":"1.0","In-reply-to":"<20170921115746.GB2717@work-vm>","Content-type":"text/plain; charset=\"utf-8\"; format=\"flowed\"","Content-transfer-encoding":"7bit","Content-language":"en-GB","X-Brightmail-Tracker":["H4sIAAAAAAAAA02Sa0hTYRjHeXfOtrPl6nWaPligDPpQlNmNDpJW5IcFkdanJkitdpjDTWVT\n\tyT7EQnQ272ktp9hdyLxQm5t9yHDOOS9JZLPMmHihRBNLM7Oh1nYM9u33PM//z/P8X16KENdw\n\toyhVZg6jzZSrJTwhaXOtDu0TDDllcbWFcXSZzcunq+bbuPSdpTE+/d6wyqetr3/x6d6yDpL+\n\tbC3nnOBLp0csHOlCp4cnLbc2oRQiVXhMwahVeYx2f+IlYfryOxc3u0Z+tc6+iPTIKTUiAQX4\n\tMOj720mWI+Ctt41nREJKjJ8gMDxsJfwDMV5C0D6d898w2dBDsKJGBEumAg5bzCBoXHuGjIii\n\teDgOOr0JfkMYToFm55tAOxwfBNdqjF9O4GIEFnMF168R4URYbBwJXEHiXeDyTAV4O74AtpW7\n\tPFYTCr+rvYG+AO+F79165GcCx8OX9UIuy9FgaZ4nWI4E1/go8i8D7ObBRN0cl02QBIMVpk0O\n\tg9leK5/lnTBcXUKyBgOCkpc3uWxRiWDQtLapOg79ox4Ou2Ir3LKZCH80wCIoLhKzKIUP07Gs\n\t+iQ8WhjZfFIfglrDFFGJos1BgcxBIcxBIcxBIe4jsgmFM7k6jZLRHYrVyTW63Exl7JUszQv0\n\t768MrPcudqBld7wDYQpJQkS0u1sm5srzdPkaBwKKkISL0IBTJhYp5PnXGG3WRW2umtE50A6K\n\tlESKElKLZGKslOcwGQyTzWj/TzmUIEqP0oZLvs5LW0L/mOsz3Mpx28Q9xWmL60dfzTCdvLFq\n\t2p0aM3nu1Witynij5XKEayWhVFFv6EpOqxqb8R15MHh0Q1WU/O367Y99DqZAI/7ke8zrUGwh\n\t7WdCGrZ1eexni+dKk9Z7DKda7fx4DLwYk/X5eUnSU9o2O7Sm/lnhbPNJSF26/MAeQquT/wWr\n\tXUlCJwMAAA==","H4sIAAAAAAAAA+NgFnrBLMWRmVeSWpSXmKPExsVy+t/xa7qc545EGnTN47Ho3XaP3WLi2/Ws\n\tFtM+32a3uNL+k91iy/5v7BbHe3ewWNzZ0sfkwO7x5NpmJo/3+66yefRtWcUYwBzFZZOSmpNZ\n\tllqkb5fAlfH10jHWgimJFbO3f2JsYDzi0cXIySEhYCLxaN5RZghbTOLCvfVsXYxcHEICSxgl\n\tPnedYoVwXjBK3O/9ydjFyMHBJmAgse+eLYgpLOAnMaVNHsQUETCSOPZTAaSaWaCDUeL7pT3s\n\tIDOFBH4zSkxaYwli8wrYSXxado0FxGYRUJU4dvUxmC0qECHR9/YyO0SNoMSPyffA4pwCOhIf\n\tDjcwgtjMAmYSX14eZoWw5SU2r3nLDGGLSxy7f5NxAqPgLCTts5C0zELSMgtJywJGllWMIqml\n\txbnpucVGesWJucWleel6yfm5mxiB4b/t2M8tOxi73gUfYhTgYFTi4b2x73CkEGtiWXFl7iFG\n\tCQ5mJRFextNHIoV4UxIrq1KL8uOLSnNSiw8xSnOwKInz9u5ZHSkkkJ5YkpqdmlqQWgSTZeLg\n\tlGpgtNmybk3c1j+n/35irz5Qxz/p6Q2rqRk+06fJfLuZLObcGXAk8fLHOBMXAws9Pu17u/91\n\t6Ngo9cbv1OKxYJs5fXHHp0vBEade5oXzxkubFP8Umuh8+/nen1yxfX9mTujhTrbRWyIokxrd\n\tWO57r/9x/tzmZZ85sw8sO3J1m/t5z6pW5upZJ2SXK7EUZyQaajEXFScCAMAmv3d7AgAA"],"X-CMS-MailID":"20170922084705eucas1p1db28872da71401b1a4bd0161e9fdb16d","X-Msg-Generator":"CA","X-Sender-IP":"182.198.249.180","X-Local-Sender":"=?utf-8?q?Alexey_Perevalov=1BSRR-Virtualization_Lab=1B?=\n\t=?utf-8?b?7IK87ISx7KCE7J6QG1NlbmlvciBFbmdpbmVlcg==?=","X-Global-Sender":"=?utf-8?q?Alexey_Perevalov=1BSRR-Virtualization_Lab=1BSa?=\n\t=?utf-8?q?msung_Electronics=1BSenior_Engineer?=","X-Sender-Code":"=?utf-8?q?C10=1BCISHQ=1BC10GD01GD010154?=","CMS-TYPE":"201P","X-CMS-RootMailID":"20170919164825eucas1p213639e95387bf054270d020f1d7dbfe9","X-RootMTR":"20170919164825eucas1p213639e95387bf054270d020f1d7dbfe9","References":"<1505839684-10046-1-git-send-email-a.perevalov@samsung.com>\n\t<CGME20170919164825eucas1p213639e95387bf054270d020f1d7dbfe9@eucas1p2.samsung.com>\n\t<1505839684-10046-8-git-send-email-a.perevalov@samsung.com>\n\t<20170921115746.GB2717@work-vm>","X-detected-operating-system":"by eggs.gnu.org: GNU/Linux 3.x [fuzzy]","X-Received-From":"210.118.77.11","Subject":"Re: [Qemu-devel] [PATCH v10 07/10] migration: calculate vCPU\n\tblocktime on dst side","X-BeenThere":"qemu-devel@nongnu.org","X-Mailman-Version":"2.1.21","Precedence":"list","List-Id":"<qemu-devel.nongnu.org>","List-Unsubscribe":"<https://lists.nongnu.org/mailman/options/qemu-devel>,\n\t<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>","List-Archive":"<http://lists.nongnu.org/archive/html/qemu-devel/>","List-Post":"<mailto:qemu-devel@nongnu.org>","List-Help":"<mailto:qemu-devel-request@nongnu.org?subject=help>","List-Subscribe":"<https://lists.nongnu.org/mailman/listinfo/qemu-devel>,\n\t<mailto:qemu-devel-request@nongnu.org?subject=subscribe>","Cc":"heetae82.ahn@samsung.com, quintela@redhat.com, qemu-devel@nongnu.org,\n\tpeterx@redhat.com, i.maximets@samsung.com","Errors-To":"qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org","Sender":"\"Qemu-devel\"\n\t<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>"}},{"id":1776845,"web_url":"http://patchwork.ozlabs.org/comment/1776845/","msgid":"<00d3ead7-56e5-1093-bb8b-b6471192bf73@samsung.com>","list_archive_url":null,"date":"2017-09-28T08:01:01","subject":"Re: [Qemu-devel] [PATCH v10 07/10] migration: calculate vCPU\n\tblocktime on dst side","submitter":{"id":18164,"url":"http://patchwork.ozlabs.org/api/people/18164/","name":"Alexey Perevalov","email":"a.perevalov@samsung.com"},"content":"On 09/21/2017 02:57 PM, Dr. David Alan Gilbert wrote:\n> * Alexey Perevalov (a.perevalov@samsung.com) wrote:\n>> This patch provides blocktime calculation per vCPU,\n>> as a summary and as a overlapped value for all vCPUs.\n>>\n>> This approach was suggested by Peter Xu, as an improvements of\n>> previous approch where QEMU kept tree with faulted page address and cpus bitmask\n>> in it. Now QEMU is keeping array with faulted page address as value and vCPU\n>> as index. It helps to find proper vCPU at UFFD_COPY time. Also it keeps\n>> list for blocktime per vCPU (could be traced with page_fault_addr)\n>>\n>> Blocktime will not calculated if postcopy_blocktime field of\n>> MigrationIncomingState wasn't initialized.\n>>\n>> Signed-off-by: Alexey Perevalov <a.perevalov@samsung.com>\n>> ---\n>>   migration/postcopy-ram.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++-\n>>   migration/trace-events   |   5 +-\n>>   2 files changed, 140 insertions(+), 3 deletions(-)\n>>\n>> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c\n>> index cc78981..9a5133f 100644\n>> --- a/migration/postcopy-ram.c\n>> +++ b/migration/postcopy-ram.c\n>> @@ -110,7 +110,6 @@ static struct PostcopyBlocktimeContext *blocktime_context_new(void)\n>>   \n>>       ctx->exit_notifier.notify = migration_exit_cb;\n>>       qemu_add_exit_notifier(&ctx->exit_notifier);\n>> -    add_migration_state_change_notifier(&ctx->postcopy_notifier);\n>>       return ctx;\n>>   }\n>>   \n>> @@ -559,6 +558,136 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr,\n>>       return 0;\n>>   }\n>>   \n>> +static int get_mem_fault_cpu_index(uint32_t pid)\n>> +{\n>> +    CPUState *cpu_iter;\n>> +\n>> +    CPU_FOREACH(cpu_iter) {\n>> +        if (cpu_iter->thread_id == pid) {\n>> +            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);\n>> +            return cpu_iter->cpu_index;\n>> +        }\n>> +    }\n>> +    trace_get_mem_fault_cpu_index(-1, pid);\n>> +    return -1;\n>> +}\n>> +\n>> +/*\n>> + * This function is being called when pagefault occurs. It\n>> + * tracks down vCPU blocking time.\n>> + *\n>> + * @addr: faulted host virtual address\n>> + * @ptid: faulted process thread id\n>> + * @rb: ramblock appropriate to addr\n>> + */\n>> +static void mark_postcopy_blocktime_begin(uint64_t addr, uint32_t ptid,\n>> +                                          RAMBlock *rb)\n>> +{\n>> +    int cpu, already_received;\n>> +    MigrationIncomingState *mis = migration_incoming_get_current();\n>> +    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;\n>> +    int64_t now_ms;\n>> +\n>> +    if (!dc || ptid == 0) {\n>> +        return;\n>> +    }\n>> +    cpu = get_mem_fault_cpu_index(ptid);\n>> +    if (cpu < 0) {\n>> +        return;\n>> +    }\n>> +\n>> +    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);\n>> +    if (dc->vcpu_addr[cpu] == 0) {\n>> +        atomic_inc(&dc->smp_cpus_down);\n>> +    }\n>> +\n>> +    atomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr);\n>> +    atomic_xchg__nocheck(&dc->last_begin, now_ms);\n>> +    atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], now_ms);\n>> +\n>> +    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);\n>> +    if (already_received) {\n>> +        atomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0);\n>> +        atomic_xchg__nocheck(&dc->page_fault_vcpu_time[cpu], 0);\n>> +        atomic_sub(&dc->smp_cpus_down, 1);\n>> +    }\n>> +    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],\n>> +                                        cpu, already_received);\n>> +}\n>> +\n>> +/*\n>> + *  This function just provide calculated blocktime per cpu and trace it.\n>> + *  Total blocktime is calculated in mark_postcopy_blocktime_end.\n>> + *\n>> + *\n>> + * Assume we have 3 CPU\n>> + *\n>> + *      S1        E1           S1               E1\n>> + * -----***********------------xxx***************------------------------> CPU1\n>> + *\n>> + *             S2                E2\n>> + * ------------****************xxx---------------------------------------> CPU2\n>> + *\n>> + *                         S3            E3\n>> + * ------------------------****xxx********-------------------------------> CPU3\n>> + *\n>> + * We have sequence S1,S2,E1,S3,S1,E2,E3,E1\n>> + * S2,E1 - doesn't match condition due to sequence S1,S2,E1 doesn't include CPU3\n>> + * S3,S1,E2 - sequence includes all CPUs, in this case overlap will be S1,E2 -\n>> + *            it's a part of total blocktime.\n>> + * S1 - here is last_begin\n>> + * Legend of the picture is following:\n>> + *              * - means blocktime per vCPU\n>> + *              x - means overlapped blocktime (total blocktime)\n>> + *\n>> + * @addr: host virtual address\n>> + */\n>> +static void mark_postcopy_blocktime_end(uint64_t addr)\n>> +{\n>> +    MigrationIncomingState *mis = migration_incoming_get_current();\n>> +    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;\n>> +    int i, affected_cpu = 0;\n>> +    int64_t now_ms;\n>> +    bool vcpu_total_blocktime = false;\n>> +\n>> +    if (!dc) {\n>> +        return;\n>> +    }\n>> +\n>> +    now_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);\n>> +\n>> +    /* lookup cpu, to clear it,\n>> +     * that algorithm looks straighforward, but it's not\n>> +     * optimal, more optimal algorithm is keeping tree or hash\n>> +     * where key is address value is a list of  */\n>> +    for (i = 0; i < smp_cpus; i++) {\n>> +        uint64_t vcpu_blocktime = 0;\n>> +        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr) {\n>> +            continue;\n>> +        }\n>> +        atomic_xchg__nocheck(&dc->vcpu_addr[i], 0);\n>> +        vcpu_blocktime = now_ms -\n>> +            atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);\n>> +        affected_cpu += 1;\n>> +        /* we need to know is that mark_postcopy_end was due to\n>> +         * faulted page, another possible case it's prefetched\n>> +         * page and in that case we shouldn't be here */\n>> +        if (!vcpu_total_blocktime &&\n>> +            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {\n>> +            vcpu_total_blocktime = true;\n>> +        }\n>> +        /* continue cycle, due to one page could affect several vCPUs */\n>> +        dc->vcpu_blocktime[i] += vcpu_blocktime;\n>> +    }\n> Unfortunately this still isn't thread safe; consider the code in\n> mark_postcopy_blocktime_begin is:\n>\n>   1 check vcpu_addr\n>   2 write vcpu_addr\n>   3 write last_begin\n>   4 write vcpu_time\n>   5 smp_cpus_down++\n>\n>   6  already_received:\n>   7     write addr = 0\n>   8     write vcpu_time = 0\n>   9     smp_cpus_down--\n>\n> and this code is:\n>   a check vcpu_addr\n>   b write vcpu_addr\n>   c read vcpu_time\n>   d read smp_cpus_down\n>\n>   e dec smp_cpus_down\n>\n> if (a) happens after (2) but before (3), (c) and (d) can also\n> happen before (3), and so you end up reading a bogus\n> vcpu_time.\n>\n> This is tricky to get right; if you changed the source to do:\n>   1 check vcpu_addr\n>   3 write last_begin\n>   4 write vcpu_time\n>   5 smp_cpus_down++\n>   2 write vcpu_addr\n>\n>   6  already_received:\n>   7     write addr = 0\n>   8     write vcpu_time = 0\n>   9     smp_cpus_down--\n>\n> I think it's safer;  if you read a good vcpu_addr you know\n> that the vcpu_time has already been written.\n>\n> However, can this check (a) happen between  the new (2) and (7) ?\n> It's slim but I think possibly; on the receiving side we've\n> just set the bitmap flag to say received - if a fault comes\n> in at about the same time then we could end up with\n>\n> 1,3,4,5,2 ab 6 7 8 9 cde\n>\n> So again we end up reading a bogus vcpu_time and double decrement\n> smp_cpus_down.\n>\n> So I think we have to have:\n>\n>    a'  read vcpu_addr\n>    b'  read vcpu_time\n>    c'  if vcpu_addr == addr && vcpu_time != 0 ...\n>    d'    clear vcpu_addr\n>    e'    read/dec smp_cpus_down\nI think even in this sequence it's possible to have\nlookup condition\n         if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||\n             atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0) == 0) {\n             continue;\n         }\nin you terms it's (not c)\nbetween\natomic_xchg__nocheck(&dc->vcpu_addr[cpu], addr); - (2)\nand\natomic_xchg__nocheck(&dc->vcpu_addr[cpu], 0); - (7)\n2c7\nProbability is lesser, but it still exists.\nHere comes up a second bitmap, but it will increase code complexity.\nBTW, remind me why we don't protect body of these functions by mutex?\n\n>\n> You should comment to say where the order is important as well;\n> because we'll never remember this - it's hairy!\n> (Better suggestions welcome)\n>\n> Dave\n>\n>> +    atomic_sub(&dc->smp_cpus_down, affected_cpu);\n>> +    if (vcpu_total_blocktime) {\n>> +        dc->total_blocktime += now_ms - atomic_fetch_add(&dc->last_begin, 0);\n>> +    }\n>> +    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,\n>> +                                      affected_cpu);\n>> +}\n>> +\n>>   /*\n>>    * Handle faults detected by the USERFAULT markings\n>>    */\n>> @@ -636,8 +765,11 @@ static void *postcopy_ram_fault_thread(void *opaque)\n>>           rb_offset &= ~(qemu_ram_pagesize(rb) - 1);\n>>           trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,\n>>                                                   qemu_ram_get_idstr(rb),\n>> -                                                rb_offset);\n>> +                                                rb_offset,\n>> +                                                msg.arg.pagefault.feat.ptid);\n>>   \n>> +        mark_postcopy_blocktime_begin((uintptr_t)(msg.arg.pagefault.address),\n>> +                                      msg.arg.pagefault.feat.ptid, rb);\n>>           /*\n>>            * Send the request to the source - we want to request one\n>>            * of our host page sizes (which is >= TPS)\n>> @@ -727,6 +859,8 @@ static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,\n>>       if (!ret) {\n>>           ramblock_recv_bitmap_set_range(rb, host_addr,\n>>                                          pagesize / qemu_target_page_size());\n>> +        mark_postcopy_blocktime_end((uint64_t)(uintptr_t)host_addr);\n>> +\n>>       }\n>>       return ret;\n>>   }\n>> diff --git a/migration/trace-events b/migration/trace-events\n>> index d2910a6..01f30fe 100644\n>> --- a/migration/trace-events\n>> +++ b/migration/trace-events\n>> @@ -114,6 +114,8 @@ process_incoming_migration_co_end(int ret, int ps) \"ret=%d postcopy-state=%d\"\n>>   process_incoming_migration_co_postcopy_end_main(void) \"\"\n>>   migration_set_incoming_channel(void *ioc, const char *ioctype) \"ioc=%p ioctype=%s\"\n>>   migration_set_outgoing_channel(void *ioc, const char *ioctype, const char *hostname)  \"ioc=%p ioctype=%s hostname=%s\"\n>> +mark_postcopy_blocktime_begin(uint64_t addr, void *dd, int64_t time, int cpu, int received) \"addr: 0x%\" PRIx64 \", dd: %p, time: %\" PRId64 \", cpu: %d, already_received: %d\"\n>> +mark_postcopy_blocktime_end(uint64_t addr, void *dd, int64_t time, int affected_cpu) \"addr: 0x%\" PRIx64 \", dd: %p, time: %\" PRId64 \", affected_cpu: %d\"\n>>   \n>>   # migration/rdma.c\n>>   qemu_rdma_accept_incoming_migration(void) \"\"\n>> @@ -190,7 +192,7 @@ postcopy_ram_enable_notify(void) \"\"\n>>   postcopy_ram_fault_thread_entry(void) \"\"\n>>   postcopy_ram_fault_thread_exit(void) \"\"\n>>   postcopy_ram_fault_thread_quit(void) \"\"\n>> -postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) \"Request for HVA=0x%\" PRIx64 \" rb=%s offset=0x%zx\"\n>> +postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, uint32_t pid) \"Request for HVA=0x%\" PRIx64 \" rb=%s offset=0x%zx pid=%u\"\n>>   postcopy_ram_incoming_cleanup_closeuf(void) \"\"\n>>   postcopy_ram_incoming_cleanup_entry(void) \"\"\n>>   postcopy_ram_incoming_cleanup_exit(void) \"\"\n>> @@ -199,6 +201,7 @@ save_xbzrle_page_skipping(void) \"\"\n>>   save_xbzrle_page_overflow(void) \"\"\n>>   ram_save_iterate_big_wait(uint64_t milliconds, int iterations) \"big wait: %\" PRIu64 \" milliseconds, %d iterations\"\n>>   ram_load_complete(int ret, uint64_t seq_iter) \"exit_code %d seq iteration %\" PRIu64\n>> +get_mem_fault_cpu_index(int cpu, uint32_t pid) \"cpu: %d, pid: %u\"\n>>   \n>>   # migration/exec.c\n>>   migration_exec_outgoing(const char *cmd) \"cmd=%s\"\n>> -- \n>> 1.9.1\n>>\n> --\n> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK\n>\n>\n>","headers":{"Return-Path":"<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>","X-Original-To":"incoming@patchwork.ozlabs.org","Delivered-To":"patchwork-incoming@bilbo.ozlabs.org","Authentication-Results":"ozlabs.org;\n\tspf=pass (mailfrom) smtp.mailfrom=nongnu.org\n\t(client-ip=2001:4830:134:3::11; helo=lists.gnu.org;\n\tenvelope-from=qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org;\n\treceiver=<UNKNOWN>)","Received":["from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11])\n\t(using TLSv1 with cipher AES256-SHA (256/256 bits))\n\t(No client certificate requested)\n\tby ozlabs.org (Postfix) with ESMTPS id 3y2nGB1C7dz9t5x\n\tfor <incoming@patchwork.ozlabs.org>;\n\tThu, 28 Sep 2017 18:01:53 +1000 (AEST)","from localhost ([::1]:57856 helo=lists.gnu.org)\n\tby lists.gnu.org with esmtp (Exim 4.71) (envelope-from\n\t<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>)\n\tid 1dxTlV-0000cp-NK\n\tfor incoming@patchwork.ozlabs.org; Thu, 28 Sep 2017 04:01:49 -0400","from eggs.gnu.org ([2001:4830:134:3::10]:41006)\n\tby lists.gnu.org with esmtp (Exim 4.71)\n\t(envelope-from <a.perevalov@samsung.com>) id 1dxTky-0000cj-0N\n\tfor qemu-devel@nongnu.org; Thu, 28 Sep 2017 04:01:18 -0400","from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71)\n\t(envelope-from <a.perevalov@samsung.com>) id 1dxTkt-0007nE-48\n\tfor qemu-devel@nongnu.org; Thu, 28 Sep 2017 04:01:16 -0400","from mailout2.w1.samsung.com ([210.118.77.12]:58043)\n\tby eggs.gnu.org with esmtps (TLS1.0:DHE_RSA_AES_256_CBC_SHA1:32)\n\t(Exim 4.71) (envelope-from <a.perevalov@samsung.com>)\n\tid 1dxTks-0007lg-Qt\n\tfor qemu-devel@nongnu.org; Thu, 28 Sep 2017 04:01:11 -0400","from eucas1p2.samsung.com (unknown [182.198.249.207])\n\tby mailout2.w1.samsung.com (KnoxPortal) with ESMTP id\n\t20170928080105euoutp02a7ab1785d5205444adaffaa15aeb4419~od64fZUwS1463514635euoutp02M;\n\tThu, 28 Sep 2017 08:01:05 +0000 (GMT)","from eusmges3.samsung.com (unknown [203.254.199.242]) by\n\teucas1p1.samsung.com (KnoxPortal) with ESMTP id\n\t20170928080104eucas1p163482f4703c2a027585e257921971588~od63znXk81322813228eucas1p1T;\n\tThu, 28 Sep 2017 08:01:04 +0000 (GMT)","from eucas1p1.samsung.com ( [182.198.249.206]) by\n\teusmges3.samsung.com (EUCPMTA) with SMTP id F1.CD.12867.F3CACC95;\n\tThu, 28 Sep 2017 09:01:03 +0100 (BST)","from eusmgms1.samsung.com (unknown [182.198.249.179]) by\n\teucas1p2.samsung.com (KnoxPortal) with ESMTP id\n\t20170928080103eucas1p2a742b9919f276748716a7a9bb6f7f8f8~od62_7hr52907129071eucas1p2B;\n\tThu, 28 Sep 2017 08:01:03 +0000 (GMT)","from eusync4.samsung.com ( [203.254.199.214]) by\n\teusmgms1.samsung.com (EUCPMTA) with SMTP id BF.AC.18832.F3CACC95;\n\tThu, 28 Sep 2017 09:01:03 +0100 (BST)","from [106.109.129.199] by eusync4.samsung.com (Oracle\n\tCommunications Messaging Server 7.0.5.31.0 64bit (built May 5 2014))\n\twith ESMTPA id <0OWZ008GZCXQRM70@eusync4.samsung.com>;\n\tThu, 28 Sep 2017 09:01:03 +0100 (BST)"],"X-AuditID":"cbfec7f2-f793b6d000003243-17-59ccac3ff5cf","MIME-version":"1.0","Content-type":"text/plain; charset=\"utf-8\"; format=\"flowed\"","To":"\"Dr. David Alan Gilbert\" <dgilbert@redhat.com>","From":"Alexey Perevalov <a.perevalov@samsung.com>","Message-id":"<00d3ead7-56e5-1093-bb8b-b6471192bf73@samsung.com>","Date":"Thu, 28 Sep 2017 11:01:01 +0300","User-Agent":"Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101\n\tThunderbird/52.3.0","In-reply-to":"<20170921115746.GB2717@work-vm>","Content-language":"en-GB","X-Brightmail-Tracker":["H4sIAAAAAAAAA+NgFprCKsWRmVeSWpSXmKPExsWy7djPc7r2a85EGkz/am3Ru+0eu8XEt+tZ\n\tLaZ9vs1ucaX9J7vFlv3f2C2O9+5gsbizpY/Jgd3jybXNTB7v911l8+jbsooxgDmKyyYlNSez\n\tLLVI3y6BK2P9x2ssBX1JFVNmf2JtYDzm2cXIySEhYCIx98w+RghbTOLCvfVsILaQwFJGiU+f\n\t/LsYuYDsz4wSFy+/Y4dpOLnoMAtEYhmjxL3NO8ASvAKCEj8m32MBsZkFrCSe/WtlhSh6wShx\n\tf8ZXVpCEsECAxJojZ4HWcXCICBhJHPupAFLDLNDBKLF5Vj8rSJxNwEBi3z1biJl2Eu9ONzOB\n\t2CwCqhJrlywAGyMqECFxYdNPsDingI7Eh8MNjBB75SUOXnkOdYO4xLH7NxlB5ksInGCTOLtg\n\tCtSbLhKT99yEsoUlXh3fAvWZjERnx0EmiIZ2RonunZ2sEM4ERokz0/9CVdlLnLp5lQliBZ/E\n\tpG3TmUGulhDglehoE4IwPSSuP9GDqHaUWPz+GhskIH4zSsxsf8w8gVF+FlKAzUIKsFlInpiF\n\t5IkFjCyrGEVSS4tz01OLjfWKE3OLS/PS9ZLzczcxAtPK6X/HP+1g/HrC6hCjAAejEg+vw47T\n\tkUKsiWXFlbmHGCU4mJVEeM9PPRMpxJuSWFmVWpQfX1Sak1p8iFGag0VJnNc2qi1SSCA9sSQ1\n\tOzW1ILUIJsvEwSnVwFhTv0XuAPfU2/sLipqWZflI6nlM331gt+i+Gz8yL28KT+F9G2PG+/Xh\n\tKv1Q1qYr35r1z/jUsSwyecPw3zHp5Mn71wTnmW68cVcoIofJhIUrdM2urMxsV/NlV36YvOWu\n\tWpM75cXjdNsvnpY/y5b/nPvn5fZpSgqzPDgVbmaddH1/dtm2dSa/ur4osRRnJBpqMRcVJwIA\n\tUD8n7icDAAA=","H4sIAAAAAAAAA+NgFnrOLMWRmVeSWpSXmKPExsVy+t/xa7r2a85EGtw7L2zRu+0eu8XEt+tZ\n\tLaZ9vs1ucaX9J7vFlv3f2C2O9+5gsbizpY/Jgd3jybXNTB7v911l8+jbsooxgDmKyyYlNSez\n\tLLVI3y6BK2P9x2ssBX1JFVNmf2JtYDzm2cXIySEhYCJxctFhFghbTOLCvfVsXYxcHEICSxgl\n\t/j39xgSS4BUQlPgx+R5YEbOAmcSXl4dZIYpeMEr8fPabvYuRg0NYwE9iSps8iCkiYCRx7KcC\n\tSAmzQAejxPdLe9hBeoUEfjNKTFpjCVLDJmAgse+eLcR4O4l3p5vBVrEIqEqsXbKAFaREVCBC\n\tYsNGfpAwp4COxIfDDYwQF8hLHLzyHOoacYlj928yTmAUnIXk0FlIDp2FpGUWkpYFjCyrGEVS\n\tS4tz03OLDfWKE3OLS/PS9ZLzczcxAoN/27Gfm3cwXtoYfIhRgINRiYdXY/HpSCHWxLLiytxD\n\tjBIczEoivOennokU4k1JrKxKLcqPLyrNSS0+xCjNwaIkztu7Z3WkkEB6YklqdmpqQWoRTJaJ\n\tg1OqgdH/91TVzl+hC2763BFf3sT/blV+krE6b3Dd/z4hK/d9JX8E6zN2N7b+Kpw4czrT+SsL\n\t7JjlD/f0rUvoWzHpUuxxn52XbY76csy9dso3qLIpQvv73yCLBRq/oi1TCpz4XKazn+2yuWme\n\tbf/4+Q8v8xKbnd9Zki2a7670sDa0Vzqey7puUZbHTyWW4oxEQy3mouJEAMz+/wt6AgAA"],"X-CMS-MailID":"20170928080103eucas1p2a742b9919f276748716a7a9bb6f7f8f8","X-Msg-Generator":"CA","X-Sender-IP":"182.198.249.179","X-Local-Sender":"=?utf-8?q?Alexey_Perevalov=1BSRR-Virtualization_Lab=1B?=\n\t=?utf-8?b?7IK87ISx7KCE7J6QG1NlbmlvciBFbmdpbmVlcg==?=","X-Global-Sender":"=?utf-8?q?Alexey_Perevalov=1BSRR-Virtualization_Lab=1BSa?=\n\t=?utf-8?q?msung_Electronics=1BSenior_Engineer?=","X-Sender-Code":"=?utf-8?q?C10=1BCISHQ=1BC10GD01GD010154?=","CMS-TYPE":"201P","X-CMS-RootMailID":"20170919164825eucas1p213639e95387bf054270d020f1d7dbfe9","X-RootMTR":"20170919164825eucas1p213639e95387bf054270d020f1d7dbfe9","References":"<1505839684-10046-1-git-send-email-a.perevalov@samsung.com>\n\t<CGME20170919164825eucas1p213639e95387bf054270d020f1d7dbfe9@eucas1p2.samsung.com>\n\t<1505839684-10046-8-git-send-email-a.perevalov@samsung.com>\n\t<20170921115746.GB2717@work-vm>","Content-Transfer-Encoding":"quoted-printable","X-detected-operating-system":"by eggs.gnu.org: GNU/Linux 3.x [fuzzy]","X-Received-From":"210.118.77.12","Subject":"Re: [Qemu-devel] [PATCH v10 07/10] migration: calculate vCPU\n\tblocktime on dst side","X-BeenThere":"qemu-devel@nongnu.org","X-Mailman-Version":"2.1.21","Precedence":"list","List-Id":"<qemu-devel.nongnu.org>","List-Unsubscribe":"<https://lists.nongnu.org/mailman/options/qemu-devel>,\n\t<mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>","List-Archive":"<http://lists.nongnu.org/archive/html/qemu-devel/>","List-Post":"<mailto:qemu-devel@nongnu.org>","List-Help":"<mailto:qemu-devel-request@nongnu.org?subject=help>","List-Subscribe":"<https://lists.nongnu.org/mailman/listinfo/qemu-devel>,\n\t<mailto:qemu-devel-request@nongnu.org?subject=subscribe>","Cc":"heetae82.ahn@samsung.com, quintela@redhat.com, qemu-devel@nongnu.org,\n\tpeterx@redhat.com, i.maximets@samsung.com","Errors-To":"qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org","Sender":"\"Qemu-devel\"\n\t<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>"}}]