
[RFC,9/9] KVM: Dirty ring support

Message ID 20200205141749.378044-10-peterx@redhat.com
State New
Series KVM: Dirty ring support (QEMU part)

Commit Message

Peter Xu Feb. 5, 2020, 2:17 p.m. UTC
The KVM dirty ring is a new interface for passing dirty bits from the
kernel to userspace.  Instead of using a bitmap for each memory region,
the dirty ring contains an array of dirtied GPAs to fetch.  Each vcpu
has one dirty ring bound to it.

There are a few major changes compared to how the old dirty logging
interface works:

  - Granularity of dirty bits

    The KVM dirty ring interface does not offer memory-region-level
    granularity for collecting dirty bits (i.e., per KVM memory slot).
    Instead, dirty bits are collected globally for all the vcpus at
    once.  The major effect is on the VGA part, because VGA dirty
    tracking is enabled as soon as the device is created, and it used
    to be done at memory region granularity.  Now that operation is
    amplified into a whole-VM sync.  Maybe there is a smarter way to
    do the same thing for VGA with the new interface, but so far I
    don't see it affecting much, at least on regular VMs.

  - Collection of dirty bits

    The old dirty logging interface collects KVM dirty bits at
    synchronization time.  The KVM dirty ring interface instead uses a
    standalone thread to do that.  So when another thread (e.g., the
    migration thread) wants to synchronize the dirty bits, it simply
    kicks the reaper thread and waits until it has flushed all the
    dirty bits to the ramblock dirty bitmap.

For more information please refer to the comments in the code.
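
As a quick illustration (not part of the patch): a minimal sketch of
how one vcpu's ring is harvested.  The kvm_dirty_gfn fields and flag
values are the ones this series relies on; mark_page_dirty_in_slot()
is a made-up stand-in for the bookkeeping that the real
kvm_dirty_ring_reap_one() below does.

static uint32_t harvest_one_ring(struct kvm_dirty_gfn *ring,
                                 uint32_t ring_size, uint32_t *fetch)
{
    uint32_t count = 0;

    while (true) {
        struct kvm_dirty_gfn *cur = &ring[*fetch % ring_size];

        if (cur->flags != KVM_DIRTY_GFN_F_DIRTY) {
            break;                  /* no more entries published by KVM */
        }
        /* cur->slot encodes (as_id << 16) | slot_id; cur->offset is the gfn */
        mark_page_dirty_in_slot(cur->slot >> 16, cur->slot & 0xffff,
                                cur->offset);
        cur->flags = KVM_DIRTY_GFN_F_RESET;  /* hand the entry back to KVM */
        (*fetch)++;
        count++;
    }
    /* KVM_RESET_DIRTY_RINGS still has to be issued to re-protect the pages */
    return count;
}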

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 accel/kvm/kvm-all.c    | 426 ++++++++++++++++++++++++++++++++++++++++-
 accel/kvm/trace-events |   7 +
 include/hw/core/cpu.h  |  10 +
 3 files changed, 440 insertions(+), 3 deletions(-)

Comments

Dr. David Alan Gilbert March 25, 2020, 8:41 p.m. UTC | #1
* Peter Xu (peterx@redhat.com) wrote:
> KVM dirty ring is a new interface to pass over dirty bits from kernel
> to the userspace.  Instead of using a bitmap for each memory region,
> the dirty ring contains an array of dirtied GPAs to fetch.  For each
> vcpu there will be one dirty ring that binds to it.
> 
> There're a few major changes comparing to how the old dirty logging
> interface would work:
> 
>   - Granularity of dirty bits
> 
>     KVM dirty ring interface does not offer memory region level
>     granularity to collect dirty bits (i.e., per KVM memory slot).
>     Instead the dirty bit is collected globally for all the vcpus at
>     once.  The major effect is on VGA part because VGA dirty tracking
>     is enabled as long as the device is created, also it was in memory
>     region granularity.  Now that operation will be amplified to a VM
>     sync.  Maybe there's smarter way to do the same thing in VGA with
>     the new interface, but so far I don't see it affects much at least
>     on regular VMs.
> 
>   - Collection of dirty bits
> 
>     The old dirty logging interface collects KVM dirty bits when
>     synchronizing dirty bits.  KVM dirty ring interface instead used a
>     standalone thread to do that.  So when the other thread (e.g., the
>     migration thread) wants to synchronize the dirty bits, it simply
>     kick the thread and wait until it flushes all the dirty bits to
>     the ramblock dirty bitmap.
> 
> For more information please refer to the comments in the code.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  accel/kvm/kvm-all.c    | 426 ++++++++++++++++++++++++++++++++++++++++-
>  accel/kvm/trace-events |   7 +
>  include/hw/core/cpu.h  |  10 +
>  3 files changed, 440 insertions(+), 3 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 6d145a8b98..201617bbb7 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -15,6 +15,7 @@
>  
>  #include "qemu/osdep.h"
>  #include <sys/ioctl.h>
> +#include <poll.h>
>  
>  #include <linux/kvm.h>
>  
> @@ -75,6 +76,47 @@ struct KVMParkedVcpu {
>      QLIST_ENTRY(KVMParkedVcpu) node;
>  };
>  
> +enum KVMReaperState {
> +    KVM_REAPER_NONE = 0,
> +    /* The reaper is sleeping */
> +    KVM_REAPER_WAIT,
> +    /* The reaper is reaping for dirty pages */
> +    KVM_REAPER_REAPING,
> +};

That probably needs to be KVMDirtyRingReaperState
given there are many things that could be reaped.

> +/*
> + * KVM reaper instance, responsible for collecting the KVM dirty bits
> + * via the dirty ring.
> + */
> +struct KVMDirtyRingReaper {
> +    /* The reaper thread */
> +    QemuThread reaper_thr;
> +    /*
> +     * Telling the reaper thread to wakeup.  This should be used as a
> +     * generic interface to kick the reaper thread, like, in vcpu
> +     * threads where it gets a exit due to ring full.
> +     */
> +    EventNotifier reaper_event;

I think I'd just use a simple semaphore for this type of thing.

> +    /*
> +     * This should only be used when someone wants to do synchronous
> +     * flush of the dirty ring buffers.  Logically we can achieve this
> +     * even with the reaper_event only, however that'll make things
> +     * complicated.  This extra event can make the sync procedure easy
> +     * and clean.
> +     */
> +    EventNotifier reaper_flush_event;
> +    /*
> +     * Used in pair with reaper_flush_event, that the sem will be
> +     * posted to notify that the previous flush event is handled by
> +     * the reaper thread.
> +     */
> +    QemuSemaphore reaper_flush_sem;
> +    /* Iteration number of the reaper thread */
> +    volatile uint64_t reaper_iteration;
> +    /* Status of the reaper thread */
> +    volatile enum KVMReaperState reaper_state;
> +};
> +
>  struct KVMState
>  {
>      AccelState parent_obj;
> @@ -121,7 +163,6 @@ struct KVMState
>      void *memcrypt_handle;
>      int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);
>  
> -    /* For "info mtree -f" to tell if an MR is registered in KVM */
>      int nr_as;
>      struct KVMAs {
>          KVMMemoryListener *ml;
> @@ -129,6 +170,7 @@ struct KVMState
>      } *as;
>      int kvm_dirty_ring_size;
>      int kvm_dirty_gfn_count;    /* If nonzero, then kvm dirty ring enabled */
> +    struct KVMDirtyRingReaper reaper;
>  };
>  
>  KVMState *kvm_state;
> @@ -348,6 +390,11 @@ int kvm_destroy_vcpu(CPUState *cpu)
>          goto err;
>      }
>  
> +    ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
> +    if (ret < 0) {
> +        goto err;
> +    }
> +
>      vcpu = g_malloc0(sizeof(*vcpu));
>      vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
>      vcpu->kvm_fd = cpu->kvm_fd;
> @@ -391,6 +438,7 @@ int kvm_init_vcpu(CPUState *cpu)
>      cpu->kvm_fd = ret;
>      cpu->kvm_state = s;
>      cpu->vcpu_dirty = true;
> +    qemu_sem_init(&cpu->kvm_dirty_ring_avail, 0);
>  
>      mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>      if (mmap_size < 0) {
> @@ -412,6 +460,18 @@ int kvm_init_vcpu(CPUState *cpu)
>              (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
>      }
>  
> +    if (s->kvm_dirty_gfn_count) {
> +        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
> +                                   PROT_READ | PROT_WRITE, MAP_SHARED,

Is the MAP_SHARED required?

> +                                   cpu->kvm_fd,
> +                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
> +        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
> +            ret = -errno;
> +            DPRINTF("mmap'ing vcpu dirty gfns failed\n");

Include errno?

> +            goto err;
> +        }
> +    }
> +
>      ret = kvm_arch_init_vcpu(cpu);
>  err:
>      return ret;
> @@ -525,6 +585,11 @@ static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
>      cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
>  }
>  
> +static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
> +{
> +    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
> +}
> +
>  #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
>  
>  /* Allocate the dirty bitmap for a slot  */
> @@ -1100,6 +1165,305 @@ out:
>      kvm_slots_unlock(kml);
>  }
>  
> +static void kvm_dirty_ring_reaper_kick(void)
> +{
> +    trace_kvm_dirty_ring_reaper_kick("any");
> +    event_notifier_set(&kvm_state->reaper.reaper_event);
> +}
> +
> +static void kvm_dirty_ring_reaper_kick_flush(void)
> +{
> +    trace_kvm_dirty_ring_reaper_kick("flush");
> +    event_notifier_set(&kvm_state->reaper.reaper_flush_event);
> +}
> +
> +/* Should be with all slots_lock held for the address spaces */
> +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
> +                                     uint32_t slot_id, uint64_t offset)
> +{
> +    KVMMemoryListener *kml;
> +    KVMSlot *mem;
> +
> +    assert(as_id < s->nr_as);
> +    /* These fields shouldn't change after VM inits */
> +    kml = s->as[as_id].ml;
> +    mem = &kml->slots[slot_id];
> +    set_bit(offset, mem->dirty_bmap);
> +}
> +
> +static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
> +{
> +    return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
> +}
> +
> +static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
> +{
> +    gfn->flags = KVM_DIRTY_GFN_F_RESET;
> +}
> +
> +/* Should be with all slots_lock held for the address spaces */
> +static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
> +{
> +    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
> +    uint32_t gfn_count = s->kvm_dirty_gfn_count;
> +    uint32_t count = 0, fetch = cpu->kvm_fetch_index;
> +
> +    assert(dirty_gfns && gfn_count);
> +
> +    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
> +
> +    while (true) {
> +        cur = &dirty_gfns[fetch % gfn_count];
> +        if (!dirty_gfn_is_dirtied(cur)) {
> +            break;
> +        }
> +        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
> +        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
> +                                 cur->offset);
> +        dirty_gfn_set_collected(cur);
> +        fetch++;
> +        count++;
> +    }
> +    cpu->kvm_fetch_index = fetch;
> +
> +    return count;
> +}
> +
> +static uint64_t kvm_dirty_ring_reap(KVMState *s)
> +{
> +    KVMMemoryListener *kml;
> +    int ret, i, locked_count = s->nr_as;
> +    CPUState *cpu;
> +    uint64_t total = 0;
> +
> +    /*
> +     * We need to lock all kvm slots for all address spaces here,
> +     * because:
> +     *
> +     * (1) We need to mark dirty for dirty bitmaps in multiple slots
> +     *     and for tons of pages, so it's better to take the lock here
> +     *     once rather than once per page.  And more importantly,
> +     *
> +     * (2) We must _NOT_ publish dirty bits to the other threads
> +     *     (e.g., the migration thread) via the kvm memory slot dirty
> +     *     bitmaps before correctly re-protect those dirtied pages.
> +     *     Otherwise we can have potential risk of data corruption if
> +     *     the page data is read in the other thread before we do
> +     *     reset below.
> +     */
> +    for (i = 0; i < s->nr_as; i++) {
> +        kml = s->as[i].ml;
> +        if (!kml) {
> +            /*
> +             * This is tricky - we grow s->as[] dynamically now.  Take
> +             * care of that case.  We also assumed the as[] will fill
> +             * one by one starting from zero.  Without this, we race
> +             * with register_smram_listener.
> +             *
> +             * TODO: make all these prettier...
> +             */
> +            locked_count = i;
> +            break;
> +        }
> +        kvm_slots_lock(kml);
> +    }
> +
> +    CPU_FOREACH(cpu) {
> +        total += kvm_dirty_ring_reap_one(s, cpu);
> +    }
> +
> +    if (total) {
> +        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
> +        assert(ret == total);
> +    }
> +
> +    /* Unlock whatever locks that we have locked */
> +    for (i = 0; i < locked_count; i++) {
> +        kvm_slots_unlock(s->as[i].ml);
> +    }
> +
> +    CPU_FOREACH(cpu) {
> +        if (cpu->kvm_dirty_ring_full) {
> +            qemu_sem_post(&cpu->kvm_dirty_ring_avail);
> +        }

Why do you need to wait until here - couldn't you release
each vcpu after you've reaped it?

> +    }
> +
> +    return total;
> +}
> +
> +static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
> +{
> +    /* No need to do anything */
> +}
> +
> +/*
> + * Kick all vcpus out in a synchronized way.  When returned, we
> + * guarantee that every vcpu has been kicked and at least returned to
> + * userspace once.
> + */
> +static void kvm_cpu_synchronize_kick_all(void)
> +{
> +    CPUState *cpu;
> +
> +    CPU_FOREACH(cpu) {
> +        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
> +    }
> +}
> +
> +/*
> + * Flush all the existing dirty pages to the KVM slot buffers.  When
> + * this call returns, we guarantee that all the touched dirty pages
> + * before calling this function have been put into the per-kvmslot
> + * dirty bitmap.
> + *
> + * To achieve this, we need to:
> + *
> + * (1) Kick all vcpus out, this will make sure that we flush all the
> + *     dirty buffers that potentially in the hardware (PML) into the
> + *     dirty rings, after that,
> + *
> + * (2) Kick the reaper thread and make sure it reaps all the dirty
> + *     page that is in the dirty rings.
> + *
> + * This function must be called with BQL held.
> + */
> +static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
> +{
> +    uint64_t iteration;
> +
> +    trace_kvm_dirty_ring_flush(0);
> +
> +    /*
> +     * The function needs to be serialized.  Since this function
> +     * should always be with BQL held, serialization is guaranteed.
> +     * However, let's be sure of it.
> +     */
> +    assert(qemu_mutex_iothread_locked());
> +
> +    /*
> +     * First make sure to flush the hardware buffers by kicking all
> +     * vcpus out in a synchronous way.
> +     */
> +    kvm_cpu_synchronize_kick_all();
> +
> +    iteration = r->reaper_iteration;
> +
> +    /*
> +     * Kick the reaper to collect data.  Here we must make sure that
> +     * it goes over a complete WAIT->REAPING->WAIT period so that we
> +     * know the reaper has collected all the dirty pages even in the
> +     * hardware buffers we just flushed.  To achieve this, we kick the
> +     * flush_event.
> +     */
> +    kvm_dirty_ring_reaper_kick_flush();
> +    qemu_sem_wait(&r->reaper_flush_sem);
> +
> +    /* When reach here, we must have finished at least one iteration */
> +    assert(r->reaper_iteration > iteration);
> +
> +    trace_kvm_dirty_ring_flush(1);
> +}
> +
> +static void *kvm_dirty_ring_reaper_thread(void *data)
> +{
> +    KVMState *s = data;
> +    struct KVMDirtyRingReaper *r = &s->reaper;
> +    struct pollfd *pfd = g_new0(struct pollfd, 2);
> +    uint64_t count;
> +    int64_t stamp;
> +    int ret;
> +
> +    rcu_register_thread();
> +
> +    trace_kvm_dirty_ring_reaper("init");
> +
> +    pfd[0].fd = event_notifier_get_fd(&r->reaper_event);
> +    pfd[0].events = POLLIN;
> +    pfd[1].fd = event_notifier_get_fd(&r->reaper_flush_event);
> +    pfd[1].events = POLLIN;
> +
> +    while (true) {
> +        bool flush_requested = false;
> +
> +        r->reaper_state = KVM_REAPER_WAIT;
> +        trace_kvm_dirty_ring_reaper("wait");
> +        /*
> +         * TODO: provide a smarter timeout rather than a constant?  If
> +         * this timeout is too small it could eat a lot of CPU
> +         * resource, however if too big then VGA could be less
> +         * responsive.  30ms is a value that is not too small so it
> +         * won't eat much CPU, while the VGA can still get ~30Hz
> +         * refresh rate.
> +         */
> +        ret = poll(pfd, 2, 30);
> +        trace_kvm_dirty_ring_reaper("wakeup");
> +        r->reaper_state = KVM_REAPER_REAPING;
> +
> +        if (ret == -1) {
> +            error_report("%s: poll() failed: %s", __func__, strerror(errno));
> +            break;
> +        }
> +
> +        /*
> +         * Note: we only handle one request at a time.  Also, we'll
> +         * clear the event flag before we reap, so each SET to the
> +         * event will guarantee that another full-reap procedure will
> +         * happen.
> +         */
> +        if (pfd[0].revents) {
> +            ret = event_notifier_test_and_clear(&r->reaper_event);
> +            assert(ret);
> +        } else if (pfd[1].revents) {
> +            ret = event_notifier_test_and_clear(&r->reaper_flush_event);
> +            assert(ret);
> +            flush_requested = true;
> +        }
> +
> +        stamp = get_clock();
> +        count = kvm_dirty_ring_reap(s);
> +        stamp = get_clock() - stamp;
> +
> +        r->reaper_iteration++;
> +
> +        if (count) {
> +            trace_kvm_dirty_ring_reaper_iterate(r->reaper_iteration,
> +                                                count, stamp / 1000);
> +        }
> +
> +        /*
> +         * If this iteration is to handle a flush event, wakeup the
> +         * requester of the flush
> +         */
> +        if (flush_requested) {
> +            qemu_sem_post(&r->reaper_flush_sem);
> +        }
> +    }
> +
> +    trace_kvm_dirty_ring_reaper("exit");
> +
> +    rcu_unregister_thread();
> +
> +    return NULL;
> +}
> +
> +static int kvm_dirty_ring_reaper_init(KVMState *s)
> +{
> +    struct KVMDirtyRingReaper *r = &s->reaper;
> +    int ret;
> +
> +    ret = event_notifier_init(&r->reaper_event, false);
> +    assert(ret == 0);
> +    ret = event_notifier_init(&r->reaper_flush_event, false);
> +    assert(ret == 0);
> +    qemu_sem_init(&r->reaper_flush_sem, 0);
> +
> +    qemu_thread_create(&r->reaper_thr, "kvm-reaper",
> +                       kvm_dirty_ring_reaper_thread,
> +                       s, QEMU_THREAD_JOINABLE);

That's marked as joinable - does it ever get joined?
If the reaper thread does exit on error (I can only see the poll
failure?) - what happens elsewhere - will things still try and kick it?

Dave

> +
> +    return 0;
> +}
> +
>  static void kvm_region_add(MemoryListener *listener,
>                             MemoryRegionSection *section)
>  {
> @@ -1128,6 +1492,36 @@ static void kvm_log_sync(MemoryListener *listener,
>      kvm_slots_unlock(kml);
>  }
>  
> +static void kvm_log_sync_global(MemoryListener *l)
> +{
> +    KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
> +    KVMState *s = kvm_state;
> +    KVMSlot *mem;
> +    int i;
> +
> +    /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
> +    kvm_dirty_ring_flush(&s->reaper);
> +
> +    /*
> +     * TODO: make this faster when nr_slots is big while there are
> +     * only a few used slots (small VMs).
> +     */
> +    kvm_slots_lock(kml);
> +    for (i = 0; i < s->nr_slots; i++) {
> +        mem = &kml->slots[i];
> +        if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
> +            kvm_slot_sync_dirty_pages(mem);
> +            /*
> +             * This is not needed by KVM_GET_DIRTY_LOG because the
> +             * ioctl will unconditionally overwrite the whole region.
> +             * However kvm dirty ring has no such side effect.
> +             */
> +            kvm_slot_reset_dirty_pages(mem);
> +        }
> +    }
> +    kvm_slots_unlock(kml);
> +}
> +
>  static void kvm_log_clear(MemoryListener *listener,
>                            MemoryRegionSection *section)
>  {
> @@ -1234,10 +1628,17 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
>      kml->listener.region_del = kvm_region_del;
>      kml->listener.log_start = kvm_log_start;
>      kml->listener.log_stop = kvm_log_stop;
> -    kml->listener.log_sync = kvm_log_sync;
> -    kml->listener.log_clear = kvm_log_clear;
>      kml->listener.priority = 10;
>  
> +    if (s->kvm_dirty_gfn_count) {
> +        /* KVM dirty ring enabled */
> +        kml->listener.log_sync_global = kvm_log_sync_global;
> +    } else {
> +        /* KVM dirty logging enabled */
> +        kml->listener.log_sync = kvm_log_sync;
> +        kml->listener.log_clear = kvm_log_clear;
> +    }
> +
>      memory_listener_register(&kml->listener, as);
>  
>      for (i = 0; i < s->nr_as; ++i) {
> @@ -2120,6 +2521,13 @@ static int kvm_init(MachineState *ms)
>          qemu_balloon_inhibit(true);
>      }
>  
> +    if (s->kvm_dirty_gfn_count) {
> +        ret = kvm_dirty_ring_reaper_init(s);
> +        if (ret) {
> +            goto err;
> +        }
> +    }
> +
>      return 0;
>  
>  err:
> @@ -2427,6 +2835,18 @@ int kvm_cpu_exec(CPUState *cpu)
>          case KVM_EXIT_INTERNAL_ERROR:
>              ret = kvm_handle_internal_error(cpu, run);
>              break;
> +        case KVM_EXIT_DIRTY_RING_FULL:
> +            /*
> +             * We shouldn't continue if the dirty ring of this vcpu is
> +             * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
> +             */
> +            trace_kvm_dirty_ring_full(cpu->cpu_index);
> +            cpu->kvm_dirty_ring_full = true;
> +            kvm_dirty_ring_reaper_kick();
> +            qemu_sem_wait(&cpu->kvm_dirty_ring_avail);
> +            cpu->kvm_dirty_ring_full = false;
> +            ret = 0;
> +            break;
>          case KVM_EXIT_SYSTEM_EVENT:
>              switch (run->system_event.type) {
>              case KVM_SYSTEM_EVENT_SHUTDOWN:
> diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
> index 4fb6e59d19..17d6b6a154 100644
> --- a/accel/kvm/trace-events
> +++ b/accel/kvm/trace-events
> @@ -16,4 +16,11 @@ kvm_set_ioeventfd_mmio(int fd, uint64_t addr, uint32_t val, bool assign, uint32_
>  kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t size, bool datamatch) "fd: %d @0x%x val=0x%x assign: %d size: %d match: %d"
>  kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
>  kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
> +kvm_dirty_ring_full(int id) "vcpu %d"
> +kvm_dirty_ring_reap_vcpu(int id) "vcpu %d"
> +kvm_dirty_ring_page(int vcpu, uint32_t slot, uint64_t offset) "vcpu %d fetch %"PRIu32" offset 0x%"PRIx64
> +kvm_dirty_ring_reaper(const char *s) "%s"
> +kvm_dirty_ring_reaper_iterate(uint64_t iter, uint64_t count, int64_t t) "iteration %"PRIu64" reaped %"PRIu64" pages (took %"PRIi64" us)"
> +kvm_dirty_ring_reaper_kick(const char *reason) "%s"
> +kvm_dirty_ring_flush(int finished) "%d"
>  
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index 73e9a869a4..a7cddb7b40 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -342,6 +342,11 @@ struct qemu_work_item;
>   * @ignore_memory_transaction_failures: Cached copy of the MachineState
>   *    flag of the same name: allows the board to suppress calling of the
>   *    CPU do_transaction_failed hook function.
> + * @kvm_dirty_ring_full:
> + *   Whether the kvm dirty ring of this vcpu is soft-full.
> + * @kvm_dirty_ring_avail:
> + *   Semaphore to be posted when the kvm dirty ring of the vcpu is
> + *   available again.
>   *
>   * State of one CPU core or thread.
>   */
> @@ -409,9 +414,14 @@ struct CPUState {
>       */
>      uintptr_t mem_io_pc;
>  
> +    /* Only used in KVM */
>      int kvm_fd;
>      struct KVMState *kvm_state;
>      struct kvm_run *kvm_run;
> +    struct kvm_dirty_gfn *kvm_dirty_gfns;
> +    uint32_t kvm_fetch_index;
> +    QemuSemaphore kvm_dirty_ring_avail;
> +    bool kvm_dirty_ring_full;
>  
>      /* Used for events with 'vcpu' and *without* the 'disabled' properties */
>      DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS);
> -- 
> 2.24.1
> 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu March 25, 2020, 9:32 p.m. UTC | #2
On Wed, Mar 25, 2020 at 08:41:44PM +0000, Dr. David Alan Gilbert wrote:

[...]

> > +enum KVMReaperState {
> > +    KVM_REAPER_NONE = 0,
> > +    /* The reaper is sleeping */
> > +    KVM_REAPER_WAIT,
> > +    /* The reaper is reaping for dirty pages */
> > +    KVM_REAPER_REAPING,
> > +};
> 
> That probably needs to be KVMDirtyRingReaperState
> given there are many things that could be reaped.

Sure.

> 
> > +/*
> > + * KVM reaper instance, responsible for collecting the KVM dirty bits
> > + * via the dirty ring.
> > + */
> > +struct KVMDirtyRingReaper {
> > +    /* The reaper thread */
> > +    QemuThread reaper_thr;
> > +    /*
> > +     * Telling the reaper thread to wakeup.  This should be used as a
> > +     * generic interface to kick the reaper thread, like, in vcpu
> > +     * threads where it gets a exit due to ring full.
> > +     */
> > +    EventNotifier reaper_event;
> 
> I think I'd just used a simple semaphore for this type of thing.

I'm actually uncertain about which is cheaper...

Meanwhile, I want to poll two handles at the same time below (in
kvm_dirty_ring_reaper_thread).  I don't know how to do that with a
semaphore.  Could it be done?

[...]

> > @@ -412,6 +460,18 @@ int kvm_init_vcpu(CPUState *cpu)
> >              (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
> >      }
> >  
> > +    if (s->kvm_dirty_gfn_count) {
> > +        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
> > +                                   PROT_READ | PROT_WRITE, MAP_SHARED,
> 
> Is the MAP_SHARED required?

Yes it's required.  It's the same when we map the per-vcpu kvm_run.

If we use MAP_PRIVATE, it'll be in a COW fashion - when the userspace
writes to the dirty gfns the 1st time, it'll copy the current dirty
ring page in the kernel and from now on QEMU will never be able to see
what the kernel writes to the dirty gfn pages.  MAP_SHARED means
userspace and the kernel share exactly the same page(s).

> 
> > +                                   cpu->kvm_fd,
> > +                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
> > +        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
> > +            ret = -errno;
> > +            DPRINTF("mmap'ing vcpu dirty gfns failed\n");
> 
> Include errno?

Will do.

[...]

> > +static uint64_t kvm_dirty_ring_reap(KVMState *s)
> > +{
> > +    KVMMemoryListener *kml;
> > +    int ret, i, locked_count = s->nr_as;
> > +    CPUState *cpu;
> > +    uint64_t total = 0;
> > +
> > +    /*
> > +     * We need to lock all kvm slots for all address spaces here,
> > +     * because:
> > +     *
> > +     * (1) We need to mark dirty for dirty bitmaps in multiple slots
> > +     *     and for tons of pages, so it's better to take the lock here
> > +     *     once rather than once per page.  And more importantly,
> > +     *
> > +     * (2) We must _NOT_ publish dirty bits to the other threads
> > +     *     (e.g., the migration thread) via the kvm memory slot dirty
> > +     *     bitmaps before correctly re-protect those dirtied pages.
> > +     *     Otherwise we can have potential risk of data corruption if
> > +     *     the page data is read in the other thread before we do
> > +     *     reset below.
> > +     */
> > +    for (i = 0; i < s->nr_as; i++) {
> > +        kml = s->as[i].ml;
> > +        if (!kml) {
> > +            /*
> > +             * This is tricky - we grow s->as[] dynamically now.  Take
> > +             * care of that case.  We also assumed the as[] will fill
> > +             * one by one starting from zero.  Without this, we race
> > +             * with register_smram_listener.
> > +             *
> > +             * TODO: make all these prettier...
> > +             */
> > +            locked_count = i;
> > +            break;
> > +        }
> > +        kvm_slots_lock(kml);
> > +    }
> > +
> > +    CPU_FOREACH(cpu) {
> > +        total += kvm_dirty_ring_reap_one(s, cpu);
> > +    }
> > +
> > +    if (total) {
> > +        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
> > +        assert(ret == total);
> > +    }
> > +
> > +    /* Unlock whatever locks that we have locked */
> > +    for (i = 0; i < locked_count; i++) {
> > +        kvm_slots_unlock(s->as[i].ml);
> > +    }
> > +
> > +    CPU_FOREACH(cpu) {
> > +        if (cpu->kvm_dirty_ring_full) {
> > +            qemu_sem_post(&cpu->kvm_dirty_ring_avail);
> > +        }
> 
> Why do you need to wait until here - couldn't you release
> each vcpu after you've reaped it?

We probably still need to wait.  Even after we have reaped all the
dirty bits, we have only marked the pages as "collected"; the buffers
only become available again after the kernel re-protects those pages
(when the above KVM_RESET_DIRTY_RINGS completes).  Before that,
continuing the vcpu could let it exit again with the same ring-full
event.

[...]

> > +static int kvm_dirty_ring_reaper_init(KVMState *s)
> > +{
> > +    struct KVMDirtyRingReaper *r = &s->reaper;
> > +    int ret;
> > +
> > +    ret = event_notifier_init(&r->reaper_event, false);
> > +    assert(ret == 0);
> > +    ret = event_notifier_init(&r->reaper_flush_event, false);
> > +    assert(ret == 0);
> > +    qemu_sem_init(&r->reaper_flush_sem, 0);
> > +
> > +    qemu_thread_create(&r->reaper_thr, "kvm-reaper",
> > +                       kvm_dirty_ring_reaper_thread,
> > +                       s, QEMU_THREAD_JOINABLE);
> 
> That's marked as joinable - does it ever get joined?
> If the reaper thread does exit on error (I can only see the poll
> failure?) - what happens elsewhere - will things still try and kick it?

The reaper thread is not designed to fail, for sure.  If it does fail,
it'll exit without being joined; the alternative would be to abort()
directly in the thread, which doesn't seem any better...

Regarding "why not join() it": I think it's simply because we don't
have a counterpart to AccelClass.init_machine() and kvm_init(). :-)
From that POV, I think I'll still use JOINABLE, with the hope that
someday the destroy_machine() hook will be ready and we'll be able to
join it.

Thanks,
Dr. David Alan Gilbert March 26, 2020, 2:14 p.m. UTC | #3
* Peter Xu (peterx@redhat.com) wrote:
> On Wed, Mar 25, 2020 at 08:41:44PM +0000, Dr. David Alan Gilbert wrote:
> 
> [...]
> 
> > > +enum KVMReaperState {
> > > +    KVM_REAPER_NONE = 0,
> > > +    /* The reaper is sleeping */
> > > +    KVM_REAPER_WAIT,
> > > +    /* The reaper is reaping for dirty pages */
> > > +    KVM_REAPER_REAPING,
> > > +};
> > 
> > That probably needs to be KVMDirtyRingReaperState
> > given there are many things that could be reaped.
> 
> Sure.
> 
> > 
> > > +/*
> > > + * KVM reaper instance, responsible for collecting the KVM dirty bits
> > > + * via the dirty ring.
> > > + */
> > > +struct KVMDirtyRingReaper {
> > > +    /* The reaper thread */
> > > +    QemuThread reaper_thr;
> > > +    /*
> > > +     * Telling the reaper thread to wakeup.  This should be used as a
> > > +     * generic interface to kick the reaper thread, like, in vcpu
> > > +     * threads where it gets a exit due to ring full.
> > > +     */
> > > +    EventNotifier reaper_event;
> > 
> > I think I'd just used a simple semaphore for this type of thing.
> 
> I'm actually uncertain on which is cheaper...
> 
> At the meantime, I wanted to poll two handles at the same time below
> (in kvm_dirty_ring_reaper_thread).  I don't know how to do that with
> semaphore.  Could it?

If you're OK with EventNotifier, stick with it; it's just that I'm used
to doing this with a semaphore, e.g. a flag plus the semaphore - but
that's fine.

> [...]
> 
> > > @@ -412,6 +460,18 @@ int kvm_init_vcpu(CPUState *cpu)
> > >              (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
> > >      }
> > >  
> > > +    if (s->kvm_dirty_gfn_count) {
> > > +        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
> > > +                                   PROT_READ | PROT_WRITE, MAP_SHARED,
> > 
> > Is the MAP_SHARED required?
> 
> Yes it's required.  It's the same when we map the per-vcpu kvm_run.
> 
> If we use MAP_PRIVATE, it'll be in a COW fashion - when the userspace
> writes to the dirty gfns the 1st time, it'll copy the current dirty
> ring page in the kernel and from now on QEMU will never be able to see
> what the kernel writes to the dirty gfn pages.  MAP_SHARED means the
> userspace and the kernel shares exactly the same page(s).

OK, worth a comment.

> > 
> > > +                                   cpu->kvm_fd,
> > > +                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
> > > +        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
> > > +            ret = -errno;
> > > +            DPRINTF("mmap'ing vcpu dirty gfns failed\n");
> > 
> > Include errno?
> 
> Will do.
> 
> [...]
> 
> > > +static uint64_t kvm_dirty_ring_reap(KVMState *s)
> > > +{
> > > +    KVMMemoryListener *kml;
> > > +    int ret, i, locked_count = s->nr_as;
> > > +    CPUState *cpu;
> > > +    uint64_t total = 0;
> > > +
> > > +    /*
> > > +     * We need to lock all kvm slots for all address spaces here,
> > > +     * because:
> > > +     *
> > > +     * (1) We need to mark dirty for dirty bitmaps in multiple slots
> > > +     *     and for tons of pages, so it's better to take the lock here
> > > +     *     once rather than once per page.  And more importantly,
> > > +     *
> > > +     * (2) We must _NOT_ publish dirty bits to the other threads
> > > +     *     (e.g., the migration thread) via the kvm memory slot dirty
> > > +     *     bitmaps before correctly re-protect those dirtied pages.
> > > +     *     Otherwise we can have potential risk of data corruption if
> > > +     *     the page data is read in the other thread before we do
> > > +     *     reset below.
> > > +     */
> > > +    for (i = 0; i < s->nr_as; i++) {
> > > +        kml = s->as[i].ml;
> > > +        if (!kml) {
> > > +            /*
> > > +             * This is tricky - we grow s->as[] dynamically now.  Take
> > > +             * care of that case.  We also assumed the as[] will fill
> > > +             * one by one starting from zero.  Without this, we race
> > > +             * with register_smram_listener.
> > > +             *
> > > +             * TODO: make all these prettier...
> > > +             */
> > > +            locked_count = i;
> > > +            break;
> > > +        }
> > > +        kvm_slots_lock(kml);
> > > +    }
> > > +
> > > +    CPU_FOREACH(cpu) {
> > > +        total += kvm_dirty_ring_reap_one(s, cpu);
> > > +    }
> > > +
> > > +    if (total) {
> > > +        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
> > > +        assert(ret == total);
> > > +    }
> > > +
> > > +    /* Unlock whatever locks that we have locked */
> > > +    for (i = 0; i < locked_count; i++) {
> > > +        kvm_slots_unlock(s->as[i].ml);
> > > +    }
> > > +
> > > +    CPU_FOREACH(cpu) {
> > > +        if (cpu->kvm_dirty_ring_full) {
> > > +            qemu_sem_post(&cpu->kvm_dirty_ring_avail);
> > > +        }
> > 
> > Why do you need to wait until here - couldn't you release
> > each vcpu after you've reaped it?
> 
> We probably still need to wait.  Even after we reaped all the dirty
> bits we only marked the pages as "collected", the buffers will only be
> available again until the kernel re-protect those pages (when the
> above KVM_RESET_DIRTY_RINGS completes).  Before that, continuing the
> vcpu could let it exit again with the same ring full event.

Ah OK.

> [...]
> 
> > > +static int kvm_dirty_ring_reaper_init(KVMState *s)
> > > +{
> > > +    struct KVMDirtyRingReaper *r = &s->reaper;
> > > +    int ret;
> > > +
> > > +    ret = event_notifier_init(&r->reaper_event, false);
> > > +    assert(ret == 0);
> > > +    ret = event_notifier_init(&r->reaper_flush_event, false);
> > > +    assert(ret == 0);
> > > +    qemu_sem_init(&r->reaper_flush_sem, 0);
> > > +
> > > +    qemu_thread_create(&r->reaper_thr, "kvm-reaper",
> > > +                       kvm_dirty_ring_reaper_thread,
> > > +                       s, QEMU_THREAD_JOINABLE);
> > 
> > That's marked as joinable - does it ever get joined?
> > If the reaper thread does exit on error (I can only see the poll
> > failure?) - what happens elsewhere - will things still try and kick it?
> 
> The reaper thread is not designed to fail for sure. If it fails, it'll
> exit without being joined, but otherwise I'll just abort() directly in
> the thread which seems to be not anything better...
> 
> Regarding to "why not join() it": I think it's simply because we don't
> have corresponding operation to AccelClass.init_machine() and
> kvm_init(). :-) From that POV, I think I'll still use JOINABLE with
> the hope that someday the destroy_machine() hook will be ready and
> we'll be able to join it.

OK, that's fine.

Dave

> Thanks,
> 
> -- 
> Peter Xu
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu March 26, 2020, 4:10 p.m. UTC | #4
On Thu, Mar 26, 2020 at 02:14:36PM +0000, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Wed, Mar 25, 2020 at 08:41:44PM +0000, Dr. David Alan Gilbert wrote:
> > 
> > [...]
> > 
> > > > +enum KVMReaperState {
> > > > +    KVM_REAPER_NONE = 0,
> > > > +    /* The reaper is sleeping */
> > > > +    KVM_REAPER_WAIT,
> > > > +    /* The reaper is reaping for dirty pages */
> > > > +    KVM_REAPER_REAPING,
> > > > +};
> > > 
> > > That probably needs to be KVMDirtyRingReaperState
> > > given there are many things that could be reaped.
> > 
> > Sure.
> > 
> > > 
> > > > +/*
> > > > + * KVM reaper instance, responsible for collecting the KVM dirty bits
> > > > + * via the dirty ring.
> > > > + */
> > > > +struct KVMDirtyRingReaper {
> > > > +    /* The reaper thread */
> > > > +    QemuThread reaper_thr;
> > > > +    /*
> > > > +     * Telling the reaper thread to wakeup.  This should be used as a
> > > > +     * generic interface to kick the reaper thread, like, in vcpu
> > > > +     * threads where it gets a exit due to ring full.
> > > > +     */
> > > > +    EventNotifier reaper_event;
> > > 
> > > I think I'd just used a simple semaphore for this type of thing.
> > 
> > I'm actually uncertain on which is cheaper...
> > 
> > At the meantime, I wanted to poll two handles at the same time below
> > (in kvm_dirty_ring_reaper_thread).  I don't know how to do that with
> > semaphore.  Could it?
> 
> If you're OK with EventNotifier stick with it;  it's just I'm used
> to doing with it with a semaphore; e.g. a flag then the semaphore - but
> that's fine.

Ah yes, flags could work, though we'd need to be careful with them and
use atomic accesses so that a kick isn't lost to a race.

Then I'll keep it, thanks.
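
(For the record, the flag-plus-semaphore variant Dave describes would
look roughly like the hand-written sketch below - not part of the
patch, and assuming the atomic_*() helpers from qemu/atomic.h and
qemu_sem_timedwait() - the flags carry the "why was I woken" bits that
the two EventNotifiers carry today:)

static bool reaper_kick_requested;
static bool reaper_flush_requested;
static QemuSemaphore reaper_sem;

static void reaper_kick(bool flush)
{
    if (flush) {
        atomic_set(&reaper_flush_requested, true);
    } else {
        atomic_set(&reaper_kick_requested, true);
    }
    qemu_sem_post(&reaper_sem);
}

/* Reaper side: wake up at least every 30ms, then consume the flags */
static bool reaper_wait(void)
{
    qemu_sem_timedwait(&reaper_sem, 30);
    atomic_xchg(&reaper_kick_requested, false);
    return atomic_xchg(&reaper_flush_requested, false);
}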

> 
> > [...]
> > 
> > > > @@ -412,6 +460,18 @@ int kvm_init_vcpu(CPUState *cpu)
> > > >              (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
> > > >      }
> > > >  
> > > > +    if (s->kvm_dirty_gfn_count) {
> > > > +        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
> > > > +                                   PROT_READ | PROT_WRITE, MAP_SHARED,
> > > 
> > > Is the MAP_SHARED required?
> > 
> > Yes it's required.  It's the same when we map the per-vcpu kvm_run.
> > 
> > If we use MAP_PRIVATE, it'll be in a COW fashion - when the userspace
> > writes to the dirty gfns the 1st time, it'll copy the current dirty
> > ring page in the kernel and from now on QEMU will never be able to see
> > what the kernel writes to the dirty gfn pages.  MAP_SHARED means the
> > userspace and the kernel shares exactly the same page(s).
> 
> OK, worth a comment.

Sure.
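
Something along these lines, maybe - just the comment added on top of
the existing mmap() call, nothing else changed (a sketch, not a formal
respin):

    if (s->kvm_dirty_gfn_count) {
        /*
         * The dirty GFN ring is a buffer shared between KVM and
         * userspace, so it must be mapped MAP_SHARED just like kvm_run:
         * with MAP_PRIVATE the first userspace write would COW the page
         * and QEMU would no longer see what the kernel publishes there.
         */
        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
                                   PROT_READ | PROT_WRITE, MAP_SHARED,
                                   cpu->kvm_fd,
                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
        /* error handling as before */
    }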

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 6d145a8b98..201617bbb7 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -15,6 +15,7 @@ 
 
 #include "qemu/osdep.h"
 #include <sys/ioctl.h>
+#include <poll.h>
 
 #include <linux/kvm.h>
 
@@ -75,6 +76,47 @@  struct KVMParkedVcpu {
     QLIST_ENTRY(KVMParkedVcpu) node;
 };
 
+enum KVMReaperState {
+    KVM_REAPER_NONE = 0,
+    /* The reaper is sleeping */
+    KVM_REAPER_WAIT,
+    /* The reaper is reaping for dirty pages */
+    KVM_REAPER_REAPING,
+};
+
+/*
+ * KVM reaper instance, responsible for collecting the KVM dirty bits
+ * via the dirty ring.
+ */
+struct KVMDirtyRingReaper {
+    /* The reaper thread */
+    QemuThread reaper_thr;
+    /*
+     * Telling the reaper thread to wakeup.  This should be used as a
+     * generic interface to kick the reaper thread, like, in vcpu
+     * threads where it gets a exit due to ring full.
+     */
+    EventNotifier reaper_event;
+    /*
+     * This should only be used when someone wants to do synchronous
+     * flush of the dirty ring buffers.  Logically we can achieve this
+     * even with the reaper_event only, however that'll make things
+     * complicated.  This extra event can make the sync procedure easy
+     * and clean.
+     */
+    EventNotifier reaper_flush_event;
+    /*
+     * Used in pair with reaper_flush_event, that the sem will be
+     * posted to notify that the previous flush event is handled by
+     * the reaper thread.
+     */
+    QemuSemaphore reaper_flush_sem;
+    /* Iteration number of the reaper thread */
+    volatile uint64_t reaper_iteration;
+    /* Status of the reaper thread */
+    volatile enum KVMReaperState reaper_state;
+};
+
 struct KVMState
 {
     AccelState parent_obj;
@@ -121,7 +163,6 @@  struct KVMState
     void *memcrypt_handle;
     int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);
 
-    /* For "info mtree -f" to tell if an MR is registered in KVM */
     int nr_as;
     struct KVMAs {
         KVMMemoryListener *ml;
@@ -129,6 +170,7 @@  struct KVMState
     } *as;
     int kvm_dirty_ring_size;
     int kvm_dirty_gfn_count;    /* If nonzero, then kvm dirty ring enabled */
+    struct KVMDirtyRingReaper reaper;
 };
 
 KVMState *kvm_state;
@@ -348,6 +390,11 @@  int kvm_destroy_vcpu(CPUState *cpu)
         goto err;
     }
 
+    ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
+    if (ret < 0) {
+        goto err;
+    }
+
     vcpu = g_malloc0(sizeof(*vcpu));
     vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
     vcpu->kvm_fd = cpu->kvm_fd;
@@ -391,6 +438,7 @@  int kvm_init_vcpu(CPUState *cpu)
     cpu->kvm_fd = ret;
     cpu->kvm_state = s;
     cpu->vcpu_dirty = true;
+    qemu_sem_init(&cpu->kvm_dirty_ring_avail, 0);
 
     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
     if (mmap_size < 0) {
@@ -412,6 +460,18 @@  int kvm_init_vcpu(CPUState *cpu)
             (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
     }
 
+    if (s->kvm_dirty_gfn_count) {
+        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
+                                   PROT_READ | PROT_WRITE, MAP_SHARED,
+                                   cpu->kvm_fd,
+                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
+        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
+            ret = -errno;
+            DPRINTF("mmap'ing vcpu dirty gfns failed\n");
+            goto err;
+        }
+    }
+
     ret = kvm_arch_init_vcpu(cpu);
 err:
     return ret;
@@ -525,6 +585,11 @@  static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
     cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
 }
 
+static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
+{
+    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
+}
+
 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 
 /* Allocate the dirty bitmap for a slot  */
@@ -1100,6 +1165,305 @@  out:
     kvm_slots_unlock(kml);
 }
 
+static void kvm_dirty_ring_reaper_kick(void)
+{
+    trace_kvm_dirty_ring_reaper_kick("any");
+    event_notifier_set(&kvm_state->reaper.reaper_event);
+}
+
+static void kvm_dirty_ring_reaper_kick_flush(void)
+{
+    trace_kvm_dirty_ring_reaper_kick("flush");
+    event_notifier_set(&kvm_state->reaper.reaper_flush_event);
+}
+
+/* Should be with all slots_lock held for the address spaces */
+static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
+                                     uint32_t slot_id, uint64_t offset)
+{
+    KVMMemoryListener *kml;
+    KVMSlot *mem;
+
+    assert(as_id < s->nr_as);
+    /* These fields shouldn't change after VM inits */
+    kml = s->as[as_id].ml;
+    mem = &kml->slots[slot_id];
+    set_bit(offset, mem->dirty_bmap);
+}
+
+static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
+{
+    return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
+}
+
+static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
+{
+    gfn->flags = KVM_DIRTY_GFN_F_RESET;
+}
+
+/* Should be with all slots_lock held for the address spaces */
+static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
+{
+    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
+    uint32_t gfn_count = s->kvm_dirty_gfn_count;
+    uint32_t count = 0, fetch = cpu->kvm_fetch_index;
+
+    assert(dirty_gfns && gfn_count);
+
+    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
+
+    while (true) {
+        cur = &dirty_gfns[fetch % gfn_count];
+        if (!dirty_gfn_is_dirtied(cur)) {
+            break;
+        }
+        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
+        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
+                                 cur->offset);
+        dirty_gfn_set_collected(cur);
+        fetch++;
+        count++;
+    }
+    cpu->kvm_fetch_index = fetch;
+
+    return count;
+}
+
+static uint64_t kvm_dirty_ring_reap(KVMState *s)
+{
+    KVMMemoryListener *kml;
+    int ret, i, locked_count = s->nr_as;
+    CPUState *cpu;
+    uint64_t total = 0;
+
+    /*
+     * We need to lock all kvm slots for all address spaces here,
+     * because:
+     *
+     * (1) We need to mark dirty for dirty bitmaps in multiple slots
+     *     and for tons of pages, so it's better to take the lock here
+     *     once rather than once per page.  And more importantly,
+     *
+     * (2) We must _NOT_ publish dirty bits to the other threads
+     *     (e.g., the migration thread) via the kvm memory slot dirty
+     *     bitmaps before correctly re-protect those dirtied pages.
+     *     Otherwise we can have potential risk of data corruption if
+     *     the page data is read in the other thread before we do
+     *     reset below.
+     */
+    for (i = 0; i < s->nr_as; i++) {
+        kml = s->as[i].ml;
+        if (!kml) {
+            /*
+             * This is tricky - we grow s->as[] dynamically now.  Take
+             * care of that case.  We also assumed the as[] will fill
+             * one by one starting from zero.  Without this, we race
+             * with register_smram_listener.
+             *
+             * TODO: make all these prettier...
+             */
+            locked_count = i;
+            break;
+        }
+        kvm_slots_lock(kml);
+    }
+
+    CPU_FOREACH(cpu) {
+        total += kvm_dirty_ring_reap_one(s, cpu);
+    }
+
+    if (total) {
+        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
+        assert(ret == total);
+    }
+
+    /* Unlock whatever locks that we have locked */
+    for (i = 0; i < locked_count; i++) {
+        kvm_slots_unlock(s->as[i].ml);
+    }
+
+    CPU_FOREACH(cpu) {
+        if (cpu->kvm_dirty_ring_full) {
+            qemu_sem_post(&cpu->kvm_dirty_ring_avail);
+        }
+    }
+
+    return total;
+}
+
+static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
+{
+    /* No need to do anything */
+}
+
+/*
+ * Kick all vcpus out in a synchronized way.  When returned, we
+ * guarantee that every vcpu has been kicked and at least returned to
+ * userspace once.
+ */
+static void kvm_cpu_synchronize_kick_all(void)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
+    }
+}
+
+/*
+ * Flush all the existing dirty pages to the KVM slot buffers.  When
+ * this call returns, we guarantee that all the touched dirty pages
+ * before calling this function have been put into the per-kvmslot
+ * dirty bitmap.
+ *
+ * To achieve this, we need to:
+ *
+ * (1) Kick all vcpus out, this will make sure that we flush all the
+ *     dirty buffers that potentially in the hardware (PML) into the
+ *     dirty rings, after that,
+ *
+ * (2) Kick the reaper thread and make sure it reaps all the dirty
+ *     page that is in the dirty rings.
+ *
+ * This function must be called with BQL held.
+ */
+static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
+{
+    uint64_t iteration;
+
+    trace_kvm_dirty_ring_flush(0);
+
+    /*
+     * The function needs to be serialized.  Since this function
+     * should always be with BQL held, serialization is guaranteed.
+     * However, let's be sure of it.
+     */
+    assert(qemu_mutex_iothread_locked());
+
+    /*
+     * First make sure to flush the hardware buffers by kicking all
+     * vcpus out in a synchronous way.
+     */
+    kvm_cpu_synchronize_kick_all();
+
+    iteration = r->reaper_iteration;
+
+    /*
+     * Kick the reaper to collect data.  Here we must make sure that
+     * it goes over a complete WAIT->REAPING->WAIT period so that we
+     * know the reaper has collected all the dirty pages even in the
+     * hardware buffers we just flushed.  To achieve this, we kick the
+     * flush_event.
+     */
+    kvm_dirty_ring_reaper_kick_flush();
+    qemu_sem_wait(&r->reaper_flush_sem);
+
+    /* When reach here, we must have finished at least one iteration */
+    assert(r->reaper_iteration > iteration);
+
+    trace_kvm_dirty_ring_flush(1);
+}
+
+static void *kvm_dirty_ring_reaper_thread(void *data)
+{
+    KVMState *s = data;
+    struct KVMDirtyRingReaper *r = &s->reaper;
+    struct pollfd *pfd = g_new0(struct pollfd, 2);
+    uint64_t count;
+    int64_t stamp;
+    int ret;
+
+    rcu_register_thread();
+
+    trace_kvm_dirty_ring_reaper("init");
+
+    pfd[0].fd = event_notifier_get_fd(&r->reaper_event);
+    pfd[0].events = POLLIN;
+    pfd[1].fd = event_notifier_get_fd(&r->reaper_flush_event);
+    pfd[1].events = POLLIN;
+
+    while (true) {
+        bool flush_requested = false;
+
+        r->reaper_state = KVM_REAPER_WAIT;
+        trace_kvm_dirty_ring_reaper("wait");
+        /*
+         * TODO: provide a smarter timeout rather than a constant?  If
+         * this timeout is too small it could eat a lot of CPU
+         * resource, however if too big then VGA could be less
+         * responsive.  30ms is a value that is not too small so it
+         * won't eat much CPU, while the VGA can still get ~30Hz
+         * refresh rate.
+         */
+        ret = poll(pfd, 2, 30);
+        trace_kvm_dirty_ring_reaper("wakeup");
+        r->reaper_state = KVM_REAPER_REAPING;
+
+        if (ret == -1) {
+            error_report("%s: poll() failed: %s", __func__, strerror(errno));
+            break;
+        }
+
+        /*
+         * Note: we only handle one request at a time.  Also, we'll
+         * clear the event flag before we reap, so each SET to the
+         * event will guarantee that another full-reap procedure will
+         * happen.
+         */
+        if (pfd[0].revents) {
+            ret = event_notifier_test_and_clear(&r->reaper_event);
+            assert(ret);
+        } else if (pfd[1].revents) {
+            ret = event_notifier_test_and_clear(&r->reaper_flush_event);
+            assert(ret);
+            flush_requested = true;
+        }
+
+        stamp = get_clock();
+        count = kvm_dirty_ring_reap(s);
+        stamp = get_clock() - stamp;
+
+        r->reaper_iteration++;
+
+        if (count) {
+            trace_kvm_dirty_ring_reaper_iterate(r->reaper_iteration,
+                                                count, stamp / 1000);
+        }
+
+        /*
+         * If this iteration is to handle a flush event, wakeup the
+         * requester of the flush
+         */
+        if (flush_requested) {
+            qemu_sem_post(&r->reaper_flush_sem);
+        }
+    }
+
+    trace_kvm_dirty_ring_reaper("exit");
+
+    rcu_unregister_thread();
+
+    return NULL;
+}
+
+static int kvm_dirty_ring_reaper_init(KVMState *s)
+{
+    struct KVMDirtyRingReaper *r = &s->reaper;
+    int ret;
+
+    ret = event_notifier_init(&r->reaper_event, false);
+    assert(ret == 0);
+    ret = event_notifier_init(&r->reaper_flush_event, false);
+    assert(ret == 0);
+    qemu_sem_init(&r->reaper_flush_sem, 0);
+
+    qemu_thread_create(&r->reaper_thr, "kvm-reaper",
+                       kvm_dirty_ring_reaper_thread,
+                       s, QEMU_THREAD_JOINABLE);
+
+    return 0;
+}
+
 static void kvm_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
@@ -1128,6 +1492,36 @@  static void kvm_log_sync(MemoryListener *listener,
     kvm_slots_unlock(kml);
 }
 
+static void kvm_log_sync_global(MemoryListener *l)
+{
+    KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
+    KVMState *s = kvm_state;
+    KVMSlot *mem;
+    int i;
+
+    /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
+    kvm_dirty_ring_flush(&s->reaper);
+
+    /*
+     * TODO: make this faster when nr_slots is big while there are
+     * only a few used slots (small VMs).
+     */
+    kvm_slots_lock(kml);
+    for (i = 0; i < s->nr_slots; i++) {
+        mem = &kml->slots[i];
+        if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+            kvm_slot_sync_dirty_pages(mem);
+            /*
+             * This is not needed by KVM_GET_DIRTY_LOG because the
+             * ioctl will unconditionally overwrite the whole region.
+             * However kvm dirty ring has no such side effect.
+             */
+            kvm_slot_reset_dirty_pages(mem);
+        }
+    }
+    kvm_slots_unlock(kml);
+}
+
 static void kvm_log_clear(MemoryListener *listener,
                           MemoryRegionSection *section)
 {
@@ -1234,10 +1628,17 @@  void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
     kml->listener.region_del = kvm_region_del;
     kml->listener.log_start = kvm_log_start;
     kml->listener.log_stop = kvm_log_stop;
-    kml->listener.log_sync = kvm_log_sync;
-    kml->listener.log_clear = kvm_log_clear;
     kml->listener.priority = 10;
 
+    if (s->kvm_dirty_gfn_count) {
+        /* KVM dirty ring enabled */
+        kml->listener.log_sync_global = kvm_log_sync_global;
+    } else {
+        /* KVM dirty logging enabled */
+        kml->listener.log_sync = kvm_log_sync;
+        kml->listener.log_clear = kvm_log_clear;
+    }
+
     memory_listener_register(&kml->listener, as);
 
     for (i = 0; i < s->nr_as; ++i) {
@@ -2120,6 +2521,13 @@  static int kvm_init(MachineState *ms)
         qemu_balloon_inhibit(true);
     }
 
+    if (s->kvm_dirty_gfn_count) {
+        ret = kvm_dirty_ring_reaper_init(s);
+        if (ret) {
+            goto err;
+        }
+    }
+
     return 0;
 
 err:
@@ -2427,6 +2835,18 @@  int kvm_cpu_exec(CPUState *cpu)
         case KVM_EXIT_INTERNAL_ERROR:
             ret = kvm_handle_internal_error(cpu, run);
             break;
+        case KVM_EXIT_DIRTY_RING_FULL:
+            /*
+             * We shouldn't continue if the dirty ring of this vcpu is
+             * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
+             */
+            trace_kvm_dirty_ring_full(cpu->cpu_index);
+            cpu->kvm_dirty_ring_full = true;
+            kvm_dirty_ring_reaper_kick();
+            qemu_sem_wait(&cpu->kvm_dirty_ring_avail);
+            cpu->kvm_dirty_ring_full = false;
+            ret = 0;
+            break;
         case KVM_EXIT_SYSTEM_EVENT:
             switch (run->system_event.type) {
             case KVM_SYSTEM_EVENT_SHUTDOWN:
diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
index 4fb6e59d19..17d6b6a154 100644
--- a/accel/kvm/trace-events
+++ b/accel/kvm/trace-events
@@ -16,4 +16,11 @@  kvm_set_ioeventfd_mmio(int fd, uint64_t addr, uint32_t val, bool assign, uint32_
 kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t size, bool datamatch) "fd: %d @0x%x val=0x%x assign: %d size: %d match: %d"
 kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
 kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
+kvm_dirty_ring_full(int id) "vcpu %d"
+kvm_dirty_ring_reap_vcpu(int id) "vcpu %d"
+kvm_dirty_ring_page(int vcpu, uint32_t slot, uint64_t offset) "vcpu %d fetch %"PRIu32" offset 0x%"PRIx64
+kvm_dirty_ring_reaper(const char *s) "%s"
+kvm_dirty_ring_reaper_iterate(uint64_t iter, uint64_t count, int64_t t) "iteration %"PRIu64" reaped %"PRIu64" pages (took %"PRIi64" us)"
+kvm_dirty_ring_reaper_kick(const char *reason) "%s"
+kvm_dirty_ring_flush(int finished) "%d"
 
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 73e9a869a4..a7cddb7b40 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -342,6 +342,11 @@  struct qemu_work_item;
  * @ignore_memory_transaction_failures: Cached copy of the MachineState
  *    flag of the same name: allows the board to suppress calling of the
  *    CPU do_transaction_failed hook function.
+ * @kvm_dirty_ring_full:
+ *   Whether the kvm dirty ring of this vcpu is soft-full.
+ * @kvm_dirty_ring_avail:
+ *   Semaphore to be posted when the kvm dirty ring of the vcpu is
+ *   available again.
  *
  * State of one CPU core or thread.
  */
@@ -409,9 +414,14 @@  struct CPUState {
      */
     uintptr_t mem_io_pc;
 
+    /* Only used in KVM */
     int kvm_fd;
     struct KVMState *kvm_state;
     struct kvm_run *kvm_run;
+    struct kvm_dirty_gfn *kvm_dirty_gfns;
+    uint32_t kvm_fetch_index;
+    QemuSemaphore kvm_dirty_ring_avail;
+    bool kvm_dirty_ring_full;
 
     /* Used for events with 'vcpu' and *without* the 'disabled' properties */
     DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS);