diff mbox series

[v5,10/10] KVM: Dirty ring support

Message ID 20210310203301.194842-11-peterx@redhat.com
State New
Headers show
Series KVM: Dirty ring support (QEMU part) | expand

Commit Message

Peter Xu March 10, 2021, 8:33 p.m. UTC
KVM dirty ring is a new interface to pass over dirty bits from kernel to the
userspace.  Instead of using a bitmap for each memory region, the dirty ring
contains an array of dirtied GPAs to fetch (in the form of offset in slots).
For each vcpu there will be one dirty ring that binds to it.

kvm_dirty_ring_reap() is the major function to collect dirty rings.  It can be
called either by a standalone reaper thread that runs in the background,
collecting dirty pages for the whole VM.  It can also be called directly by any
thread that has BQL taken.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 accel/kvm/kvm-all.c    | 332 ++++++++++++++++++++++++++++++++++++++++-
 accel/kvm/trace-events |   7 +
 include/hw/core/cpu.h  |   8 +
 3 files changed, 344 insertions(+), 3 deletions(-)

Comments

Keqian Zhu March 22, 2021, 1:37 p.m. UTC | #1
On 2021/3/11 4:33, Peter Xu wrote:
> KVM dirty ring is a new interface to pass over dirty bits from kernel to the
> userspace.  Instead of using a bitmap for each memory region, the dirty ring
> contains an array of dirtied GPAs to fetch (in the form of offset in slots).
> For each vcpu there will be one dirty ring that binds to it.
> 
> kvm_dirty_ring_reap() is the major function to collect dirty rings.  It can be
> called either by a standalone reaper thread that runs in the background,
> collecting dirty pages for the whole VM.  It can also be called directly by any
> thread that has BQL taken.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  accel/kvm/kvm-all.c    | 332 ++++++++++++++++++++++++++++++++++++++++-
>  accel/kvm/trace-events |   7 +
>  include/hw/core/cpu.h  |   8 +
>  3 files changed, 344 insertions(+), 3 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index ae9393266b2..bf2b21f038b 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -15,6 +15,7 @@
>  
>  #include "qemu/osdep.h"
>  #include <sys/ioctl.h>
> +#include <poll.h>
>  
>  #include <linux/kvm.h>
>  
> @@ -80,6 +81,25 @@ struct KVMParkedVcpu {
>      QLIST_ENTRY(KVMParkedVcpu) node;
>  };
>  
> +enum KVMDirtyRingReaperState {
> +    KVM_DIRTY_RING_REAPER_NONE = 0,
> +    /* The reaper is sleeping */
> +    KVM_DIRTY_RING_REAPER_WAIT,
> +    /* The reaper is reaping for dirty pages */
> +    KVM_DIRTY_RING_REAPER_REAPING,
> +};
> +
> +/*
> + * KVM reaper instance, responsible for collecting the KVM dirty bits
> + * via the dirty ring.
> + */
> +struct KVMDirtyRingReaper {
> +    /* The reaper thread */
> +    QemuThread reaper_thr;
> +    volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
> +    volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
> +};
> +
>  struct KVMState
>  {
>      AccelState parent_obj;
> @@ -131,6 +151,7 @@ struct KVMState
>      bool kvm_dirty_ring_enabled;    /* Whether KVM dirty ring is enabled */
>      uint64_t kvm_dirty_ring_size;   /* Size of the per-vcpu dirty ring */
>      uint32_t kvm_dirty_gfn_count;   /* Number of dirty GFNs per ring */
> +    struct KVMDirtyRingReaper reaper;
>  };
>  
>  KVMState *kvm_state;
> @@ -392,6 +413,13 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
>          goto err;
>      }
>  
> +    if (cpu->kvm_dirty_gfns) {
> +        ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
> +        if (ret < 0) {
> +            goto err;
> +        }
> +    }
> +
>      vcpu = g_malloc0(sizeof(*vcpu));
>      vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
>      vcpu->kvm_fd = cpu->kvm_fd;
> @@ -468,6 +496,19 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
>              (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
>      }
>  
> +    if (s->kvm_dirty_ring_enabled) {
> +        /* Use MAP_SHARED to share pages with the kernel */
> +        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
> +                                   PROT_READ | PROT_WRITE, MAP_SHARED,
> +                                   cpu->kvm_fd,
> +                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
> +        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
> +            ret = -errno;
> +            DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
> +            goto err;
> +        }
> +    }
> +
>      ret = kvm_arch_init_vcpu(cpu);
>      if (ret < 0) {
>          error_setg_errno(errp, -ret,
> @@ -586,6 +627,11 @@ static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
>      cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
>  }
>  
> +static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
> +{
> +    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
> +}
> +
>  #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
>  
>  /* Allocate the dirty bitmap for a slot  */
> @@ -642,6 +688,170 @@ static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
>      return ret == 0;
>  }
>  
> +/* Should be with all slots_lock held for the address spaces. */
> +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
> +                                     uint32_t slot_id, uint64_t offset)
> +{
> +    KVMMemoryListener *kml;
> +    KVMSlot *mem;
> +
> +    if (as_id >= s->nr_as) {
> +        return;
> +    }
> +
> +    kml = s->as[as_id].ml;
> +    mem = &kml->slots[slot_id];
> +
> +    if (!mem->memory_size || offset >= (mem->memory_size / TARGET_PAGE_SIZE)) {
It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.


> +        return;
> +    }
> +
> +    set_bit(offset, mem->dirty_bmap);
> +}
> +
> +static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
> +{
> +    return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
> +}
> +
> +static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
> +{
> +    gfn->flags = KVM_DIRTY_GFN_F_RESET;
> +}
> +
> +/*
> + * Should be with all slots_lock held for the address spaces.  It returns the
> + * dirty page we've collected on this dirty ring.
> + */
> +static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
> +{
> +    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
> +    uint32_t gfn_count = s->kvm_dirty_gfn_count;
> +    uint32_t count = 0, fetch = cpu->kvm_fetch_index;
> +
> +    assert(dirty_gfns && gfn_count);
> +    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
> +
> +    while (true) {
> +        cur = &dirty_gfns[fetch % gfn_count];
> +        if (!dirty_gfn_is_dirtied(cur)) {
> +            break;
> +        }
> +        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
> +                                 cur->offset);
> +        dirty_gfn_set_collected(cur);
> +        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
> +        fetch++;
> +        count++;
> +    }
> +    cpu->kvm_fetch_index = fetch;
> +
> +    return count;
> +}
> +
> +/* Must be with slots_lock held */
> +static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
> +{
> +    int ret;
> +    CPUState *cpu;
> +    uint64_t total = 0;
> +    int64_t stamp;
> +
> +    stamp = get_clock();
> +
> +    CPU_FOREACH(cpu) {
> +        total += kvm_dirty_ring_reap_one(s, cpu);
> +    }
> +
> +    if (total) {
> +        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
> +        assert(ret == total);
> +    }
> +
> +    stamp = get_clock() - stamp;
> +
> +    if (total) {
> +        trace_kvm_dirty_ring_reap(total, stamp / 1000);
> +    }
> +
> +    return total;
> +}
> +
> +/*
> + * Currently for simplicity, we must hold BQL before calling this.  We can
> + * consider to drop the BQL if we're clear with all the race conditions.
> + */
> +static uint64_t kvm_dirty_ring_reap(KVMState *s)
> +{
> +    uint64_t total;
> +
> +    /*
> +     * We need to lock all kvm slots for all address spaces here,
> +     * because:
> +     *
> +     * (1) We need to mark dirty for dirty bitmaps in multiple slots
> +     *     and for tons of pages, so it's better to take the lock here
> +     *     once rather than once per page.  And more importantly,
> +     *
> +     * (2) We must _NOT_ publish dirty bits to the other threads
> +     *     (e.g., the migration thread) via the kvm memory slot dirty
> +     *     bitmaps before correctly re-protect those dirtied pages.
> +     *     Otherwise we can have potential risk of data corruption if
> +     *     the page data is read in the other thread before we do
> +     *     reset below.
> +     */
> +    kvm_slots_lock();
> +    total = kvm_dirty_ring_reap_locked(s);
> +    kvm_slots_unlock();
> +
> +    return total;
> +}
> +
> +static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
> +{
> +    /* No need to do anything */
> +}
> +
> +/*
> + * Kick all vcpus out in a synchronized way.  When returned, we
> + * guarantee that every vcpu has been kicked and at least returned to
> + * userspace once.
> + */
> +static void kvm_cpu_synchronize_kick_all(void)
> +{
> +    CPUState *cpu;
> +
> +    CPU_FOREACH(cpu) {
> +        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
> +    }
> +}
> +
> +/*
> + * Flush all the existing dirty pages to the KVM slot buffers.  When
> + * this call returns, we guarantee that all the touched dirty pages
> + * before calling this function have been put into the per-kvmslot
> + * dirty bitmap.
> + *
> + * This function must be called with BQL held.
> + */
> +static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
The argument is not used.

> +{
> +    trace_kvm_dirty_ring_flush(0);
> +    /*
> +     * The function needs to be serialized.  Since this function
> +     * should always be with BQL held, serialization is guaranteed.
> +     * However, let's be sure of it.
> +     */
> +    assert(qemu_mutex_iothread_locked());
> +    /*
> +     * First make sure to flush the hardware buffers by kicking all
> +     * vcpus out in a synchronous way.
> +     */
> +    kvm_cpu_synchronize_kick_all();
Can we make this function to be architecture specific?
It seems that kick out vCPU is an architecture specific way to flush hardware buffers
to dirty ring (for x86 PML).

> +    kvm_dirty_ring_reap(kvm_state);
> +    trace_kvm_dirty_ring_flush(1);
> +}
> +
>  /**
>   * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
>   *
> @@ -1174,7 +1384,24 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>                  goto out;
>              }
>              if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
> -                kvm_slot_get_dirty_log(kvm_state, mem);
> +                /*
> +                 * NOTE: We should be aware of the fact that here we're only
> +                 * doing a best effort to sync dirty bits.  No matter whether
> +                 * we're using dirty log or dirty ring, we ignored two facts:
> +                 *
> +                 * (1) dirty bits can reside in hardware buffers (PML)
> +                 *
> +                 * (2) after we collected dirty bits here, pages can be dirtied
> +                 * again before we do the final KVM_SET_USER_MEMORY_REGION to
> +                 * remove the slot.
> +                 *
> +                 * Not easy.  Let's cross the fingers until it's fixed.
> +                 */
> +                if (kvm_state->kvm_dirty_ring_enabled) {
> +                    kvm_dirty_ring_reap_locked(kvm_state);
> +                } else {
> +                    kvm_slot_get_dirty_log(kvm_state, mem);
> +                }
>                  kvm_slot_sync_dirty_pages(mem);
>              }
>  
> @@ -1222,6 +1449,51 @@ out:
>      kvm_slots_unlock();
>  }
>  
> +static void *kvm_dirty_ring_reaper_thread(void *data)
> +{
> +    KVMState *s = data;
> +    struct KVMDirtyRingReaper *r = &s->reaper;
> +
> +    rcu_register_thread();
> +
> +    trace_kvm_dirty_ring_reaper("init");
> +
> +    while (true) {
> +        r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
> +        trace_kvm_dirty_ring_reaper("wait");
> +        /*
> +         * TODO: provide a smarter timeout rather than a constant?
> +         */
> +        sleep(1);
> +
> +        trace_kvm_dirty_ring_reaper("wakeup");
> +        r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
> +
> +        qemu_mutex_lock_iothread();
> +        kvm_dirty_ring_reap(s);
> +        qemu_mutex_unlock_iothread();
> +
> +        r->reaper_iteration++;
> +    }
I don't know when does this iteration exit?
And I see that we start this reaper_thread in kvm_init(), maybe it's better to start it
when start dirty log and stop it when stop dirty log.

> +
> +    trace_kvm_dirty_ring_reaper("exit");
> +
> +    rcu_unregister_thread();
> +
> +    return NULL;
> +}
> +
> +static int kvm_dirty_ring_reaper_init(KVMState *s)
> +{
> +    struct KVMDirtyRingReaper *r = &s->reaper;
> +
> +    qemu_thread_create(&r->reaper_thr, "kvm-reaper",
> +                       kvm_dirty_ring_reaper_thread,
> +                       s, QEMU_THREAD_JOINABLE);
> +
> +    return 0;
> +}
> +
>  static void kvm_region_add(MemoryListener *listener,
>                             MemoryRegionSection *section)
>  {
> @@ -1250,6 +1522,36 @@ static void kvm_log_sync(MemoryListener *listener,
>      kvm_slots_unlock();
>  }
>  
> +static void kvm_log_sync_global(MemoryListener *l)
> +{
> +    KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
> +    KVMState *s = kvm_state;
> +    KVMSlot *mem;
> +    int i;
> +
> +    /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
> +    kvm_dirty_ring_flush(&s->reaper);
> +
> +    /*
> +     * TODO: make this faster when nr_slots is big while there are
> +     * only a few used slots (small VMs).
> +     */
> +    kvm_slots_lock();
> +    for (i = 0; i < s->nr_slots; i++) {
> +        mem = &kml->slots[i];
> +        if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
> +            kvm_slot_sync_dirty_pages(mem);
> +            /*
> +             * This is not needed by KVM_GET_DIRTY_LOG because the
> +             * ioctl will unconditionally overwrite the whole region.
> +             * However kvm dirty ring has no such side effect.
> +             */
> +            kvm_slot_reset_dirty_pages(mem);
> +        }
> +    }
> +    kvm_slots_unlock();
> +}
> +
>  static void kvm_log_clear(MemoryListener *listener,
>                            MemoryRegionSection *section)
>  {
> @@ -1356,10 +1658,15 @@ void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
>      kml->listener.region_del = kvm_region_del;
>      kml->listener.log_start = kvm_log_start;
>      kml->listener.log_stop = kvm_log_stop;
> -    kml->listener.log_sync = kvm_log_sync;
> -    kml->listener.log_clear = kvm_log_clear;
>      kml->listener.priority = 10;
>  
> +    if (s->kvm_dirty_ring_enabled) {
> +        kml->listener.log_sync_global = kvm_log_sync_global;
> +    } else {
> +        kml->listener.log_sync = kvm_log_sync;
> +        kml->listener.log_clear = kvm_log_clear;
> +    }
> +
>      memory_listener_register(&kml->listener, as);
>  
>      for (i = 0; i < s->nr_as; ++i) {
> @@ -2281,6 +2588,14 @@ static int kvm_init(MachineState *ms)
>          ret = ram_block_discard_disable(true);
>          assert(!ret);
>      }
> +
> +    if (s->kvm_dirty_ring_enabled) {
> +        ret = kvm_dirty_ring_reaper_init(s);
> +        if (ret) {
> +            goto err;
> +        }
> +    }
> +
>      return 0;
>  
>  err:
> @@ -2593,6 +2908,17 @@ int kvm_cpu_exec(CPUState *cpu)
>          case KVM_EXIT_INTERNAL_ERROR:
>              ret = kvm_handle_internal_error(cpu, run);
>              break;
> +        case KVM_EXIT_DIRTY_RING_FULL:
> +            /*
> +             * We shouldn't continue if the dirty ring of this vcpu is
> +             * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
> +             */
> +            trace_kvm_dirty_ring_full(cpu->cpu_index);
> +            qemu_mutex_lock_iothread();
> +            kvm_dirty_ring_reap(kvm_state);
> +            qemu_mutex_unlock_iothread();
> +            ret = 0;
> +            break;
>          case KVM_EXIT_SYSTEM_EVENT:
>              switch (run->system_event.type) {
>              case KVM_SYSTEM_EVENT_SHUTDOWN:
> diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
> index e15ae8980d3..72a01320a1a 100644
> --- a/accel/kvm/trace-events
> +++ b/accel/kvm/trace-events
> @@ -18,4 +18,11 @@ kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t
>  kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
>  kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
>  kvm_resample_fd_notify(int gsi) "gsi %d"
> +kvm_dirty_ring_full(int id) "vcpu %d"
> +kvm_dirty_ring_reap_vcpu(int id) "vcpu %d"
> +kvm_dirty_ring_page(int vcpu, uint32_t slot, uint64_t offset) "vcpu %d fetch %"PRIu32" offset 0x%"PRIx64
> +kvm_dirty_ring_reaper(const char *s) "%s"
> +kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)"
> +kvm_dirty_ring_reaper_kick(const char *reason) "%s"
> +kvm_dirty_ring_flush(int finished) "%d"
>  
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index c68bc3ba8af..2f0991d93f7 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -323,6 +323,11 @@ struct qemu_work_item;
>   * @ignore_memory_transaction_failures: Cached copy of the MachineState
>   *    flag of the same name: allows the board to suppress calling of the
>   *    CPU do_transaction_failed hook function.
> + * @kvm_dirty_ring_full:
> + *   Whether the kvm dirty ring of this vcpu is soft-full.
> + * @kvm_dirty_ring_avail:
> + *   Semaphore to be posted when the kvm dirty ring of the vcpu is
> + *   available again.
The doc does not match code.

>   *
>   * State of one CPU core or thread.
>   */
> @@ -394,9 +399,12 @@ struct CPUState {
>       */
>      uintptr_t mem_io_pc;
>  
> +    /* Only used in KVM */
>      int kvm_fd;
>      struct KVMState *kvm_state;
>      struct kvm_run *kvm_run;
> +    struct kvm_dirty_gfn *kvm_dirty_gfns;
> +    uint32_t kvm_fetch_index;
>  
>      /* Used for events with 'vcpu' and *without* the 'disabled' properties */
>      DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS);
> 

Thanks,
Keqian
Peter Xu March 22, 2021, 6:52 p.m. UTC | #2
On Mon, Mar 22, 2021 at 09:37:19PM +0800, Keqian Zhu wrote:
> > +/* Should be with all slots_lock held for the address spaces. */
> > +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
> > +                                     uint32_t slot_id, uint64_t offset)
> > +{
> > +    KVMMemoryListener *kml;
> > +    KVMSlot *mem;
> > +
> > +    if (as_id >= s->nr_as) {
> > +        return;
> > +    }
> > +
> > +    kml = s->as[as_id].ml;
> > +    mem = &kml->slots[slot_id];
> > +
> > +    if (!mem->memory_size || offset >= (mem->memory_size / TARGET_PAGE_SIZE)) {
> It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.

Fixed.

[...]

> > +/*
> > + * Flush all the existing dirty pages to the KVM slot buffers.  When
> > + * this call returns, we guarantee that all the touched dirty pages
> > + * before calling this function have been put into the per-kvmslot
> > + * dirty bitmap.
> > + *
> > + * This function must be called with BQL held.
> > + */
> > +static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
> The argument is not used.

Indeed, removed.

> 
> > +{
> > +    trace_kvm_dirty_ring_flush(0);
> > +    /*
> > +     * The function needs to be serialized.  Since this function
> > +     * should always be with BQL held, serialization is guaranteed.
> > +     * However, let's be sure of it.
> > +     */
> > +    assert(qemu_mutex_iothread_locked());
> > +    /*
> > +     * First make sure to flush the hardware buffers by kicking all
> > +     * vcpus out in a synchronous way.
> > +     */
> > +    kvm_cpu_synchronize_kick_all();
> Can we make this function to be architecture specific?
> It seems that kick out vCPU is an architecture specific way to flush hardware buffers
> to dirty ring (for x86 PML).

I can do that, but I'd say it's kind of an overkill if after all the kernel
support is not there yet, so I still tend to make it as simple as possible.

[...]

> > +static void *kvm_dirty_ring_reaper_thread(void *data)
> > +{
> > +    KVMState *s = data;
> > +    struct KVMDirtyRingReaper *r = &s->reaper;
> > +
> > +    rcu_register_thread();
> > +
> > +    trace_kvm_dirty_ring_reaper("init");
> > +
> > +    while (true) {
> > +        r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
> > +        trace_kvm_dirty_ring_reaper("wait");
> > +        /*
> > +         * TODO: provide a smarter timeout rather than a constant?
> > +         */
> > +        sleep(1);
> > +
> > +        trace_kvm_dirty_ring_reaper("wakeup");
> > +        r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
> > +
> > +        qemu_mutex_lock_iothread();
> > +        kvm_dirty_ring_reap(s);
> > +        qemu_mutex_unlock_iothread();
> > +
> > +        r->reaper_iteration++;
> > +    }
> I don't know when does this iteration exit?
> And I see that we start this reaper_thread in kvm_init(), maybe it's better to start it
> when start dirty log and stop it when stop dirty log.

Yes we can make it conditional, but note that we can't hook at functions like
memory_global_dirty_log_start() because that is only for migration purpose.

Currently QEMU exports the dirty tracking more than that, e.g., to the VGA
code.  We'll need to try to detect whether there's any existing MR got its
mr->dirty_log_mask set (besides global_dirty_log being set).  When all of them
got cleared we'll need to detect too so as to turn the thread off.

It's just easier to me to run this thread with such a timeout, then when not
necessary it'll see empty ring and return fast (index comparison for each
ring).  Not to mention the VGA dirty tracking should be on for most of the VM
lifecycle, so even if we have that knob this thread will probably be running
for 99% of the time as long as any MR has its DIRTY_MEMORY_VGA bit set.

[...]

> > diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> > index c68bc3ba8af..2f0991d93f7 100644
> > --- a/include/hw/core/cpu.h
> > +++ b/include/hw/core/cpu.h
> > @@ -323,6 +323,11 @@ struct qemu_work_item;
> >   * @ignore_memory_transaction_failures: Cached copy of the MachineState
> >   *    flag of the same name: allows the board to suppress calling of the
> >   *    CPU do_transaction_failed hook function.
> > + * @kvm_dirty_ring_full:
> > + *   Whether the kvm dirty ring of this vcpu is soft-full.
> > + * @kvm_dirty_ring_avail:
> > + *   Semaphore to be posted when the kvm dirty ring of the vcpu is
> > + *   available again.
> The doc does not match code.

Right; fixed.

Thanks for taking a look, keqian.
Keqian Zhu March 23, 2021, 1:25 a.m. UTC | #3
On 2021/3/23 2:52, Peter Xu wrote:
> On Mon, Mar 22, 2021 at 09:37:19PM +0800, Keqian Zhu wrote:
>>> +/* Should be with all slots_lock held for the address spaces. */
>>> +static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
>>> +                                     uint32_t slot_id, uint64_t offset)
>>> +{
>>> +    KVMMemoryListener *kml;
>>> +    KVMSlot *mem;
>>> +
>>> +    if (as_id >= s->nr_as) {
>>> +        return;
>>> +    }
>>> +
>>> +    kml = s->as[as_id].ml;
>>> +    mem = &kml->slots[slot_id];
>>> +
>>> +    if (!mem->memory_size || offset >= (mem->memory_size / TARGET_PAGE_SIZE)) {
>> It seems that TARGET_PAGE_SIZE should be qemu_real_host_page_size.
> 
> Fixed.
> 
> [...]
> 
>>> +/*
>>> + * Flush all the existing dirty pages to the KVM slot buffers.  When
>>> + * this call returns, we guarantee that all the touched dirty pages
>>> + * before calling this function have been put into the per-kvmslot
>>> + * dirty bitmap.
>>> + *
>>> + * This function must be called with BQL held.
>>> + */
>>> +static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
>> The argument is not used.
> 
> Indeed, removed.
> 
>>
>>> +{
>>> +    trace_kvm_dirty_ring_flush(0);
>>> +    /*
>>> +     * The function needs to be serialized.  Since this function
>>> +     * should always be with BQL held, serialization is guaranteed.
>>> +     * However, let's be sure of it.
>>> +     */
>>> +    assert(qemu_mutex_iothread_locked());
>>> +    /*
>>> +     * First make sure to flush the hardware buffers by kicking all
>>> +     * vcpus out in a synchronous way.
>>> +     */
>>> +    kvm_cpu_synchronize_kick_all();
>> Can we make this function to be architecture specific?
>> It seems that kick out vCPU is an architecture specific way to flush hardware buffers
>> to dirty ring (for x86 PML).
> 
> I can do that, but I'd say it's kind of an overkill if after all the kernel
> support is not there yet, so I still tend to make it as simple as possible.
OK.

> 
> [...]
> 
>>> +static void *kvm_dirty_ring_reaper_thread(void *data)
>>> +{
>>> +    KVMState *s = data;
>>> +    struct KVMDirtyRingReaper *r = &s->reaper;
>>> +
>>> +    rcu_register_thread();
>>> +
>>> +    trace_kvm_dirty_ring_reaper("init");
>>> +
>>> +    while (true) {
>>> +        r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
>>> +        trace_kvm_dirty_ring_reaper("wait");
>>> +        /*
>>> +         * TODO: provide a smarter timeout rather than a constant?
>>> +         */
>>> +        sleep(1);
>>> +
>>> +        trace_kvm_dirty_ring_reaper("wakeup");
>>> +        r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
>>> +
>>> +        qemu_mutex_lock_iothread();
>>> +        kvm_dirty_ring_reap(s);
>>> +        qemu_mutex_unlock_iothread();
>>> +
>>> +        r->reaper_iteration++;
>>> +    }
>> I don't know when does this iteration exit?
>> And I see that we start this reaper_thread in kvm_init(), maybe it's better to start it
>> when start dirty log and stop it when stop dirty log.
> 
> Yes we can make it conditional, but note that we can't hook at functions like
> memory_global_dirty_log_start() because that is only for migration purpose.
> 
> Currently QEMU exports the dirty tracking more than that, e.g., to the VGA
> code.  We'll need to try to detect whether there's any existing MR got its
> mr->dirty_log_mask set (besides global_dirty_log being set).  When all of them
> got cleared we'll need to detect too so as to turn the thread off.
> 
> It's just easier to me to run this thread with such a timeout, then when not
> necessary it'll see empty ring and return fast (index comparison for each
> ring).  Not to mention the VGA dirty tracking should be on for most of the VM
> lifecycle, so even if we have that knob this thread will probably be running
> for 99% of the time as long as any MR has its DIRTY_MEMORY_VGA bit set.
Make sense. Thanks for your explanation!

Thanks,
Keqian
> 
> [...]
> 
>>> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
>>> index c68bc3ba8af..2f0991d93f7 100644
>>> --- a/include/hw/core/cpu.h
>>> +++ b/include/hw/core/cpu.h
>>> @@ -323,6 +323,11 @@ struct qemu_work_item;
>>>   * @ignore_memory_transaction_failures: Cached copy of the MachineState
>>>   *    flag of the same name: allows the board to suppress calling of the
>>>   *    CPU do_transaction_failed hook function.
>>> + * @kvm_dirty_ring_full:
>>> + *   Whether the kvm dirty ring of this vcpu is soft-full.
>>> + * @kvm_dirty_ring_avail:
>>> + *   Semaphore to be posted when the kvm dirty ring of the vcpu is
>>> + *   available again.
>> The doc does not match code.
> 
> Right; fixed.
> 
> Thanks for taking a look, keqian.
>
diff mbox series

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ae9393266b2..bf2b21f038b 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -15,6 +15,7 @@ 
 
 #include "qemu/osdep.h"
 #include <sys/ioctl.h>
+#include <poll.h>
 
 #include <linux/kvm.h>
 
@@ -80,6 +81,25 @@  struct KVMParkedVcpu {
     QLIST_ENTRY(KVMParkedVcpu) node;
 };
 
+enum KVMDirtyRingReaperState {
+    KVM_DIRTY_RING_REAPER_NONE = 0,
+    /* The reaper is sleeping */
+    KVM_DIRTY_RING_REAPER_WAIT,
+    /* The reaper is reaping for dirty pages */
+    KVM_DIRTY_RING_REAPER_REAPING,
+};
+
+/*
+ * KVM reaper instance, responsible for collecting the KVM dirty bits
+ * via the dirty ring.
+ */
+struct KVMDirtyRingReaper {
+    /* The reaper thread */
+    QemuThread reaper_thr;
+    volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
+    volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
+};
+
 struct KVMState
 {
     AccelState parent_obj;
@@ -131,6 +151,7 @@  struct KVMState
     bool kvm_dirty_ring_enabled;    /* Whether KVM dirty ring is enabled */
     uint64_t kvm_dirty_ring_size;   /* Size of the per-vcpu dirty ring */
     uint32_t kvm_dirty_gfn_count;   /* Number of dirty GFNs per ring */
+    struct KVMDirtyRingReaper reaper;
 };
 
 KVMState *kvm_state;
@@ -392,6 +413,13 @@  static int do_kvm_destroy_vcpu(CPUState *cpu)
         goto err;
     }
 
+    if (cpu->kvm_dirty_gfns) {
+        ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_size);
+        if (ret < 0) {
+            goto err;
+        }
+    }
+
     vcpu = g_malloc0(sizeof(*vcpu));
     vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
     vcpu->kvm_fd = cpu->kvm_fd;
@@ -468,6 +496,19 @@  int kvm_init_vcpu(CPUState *cpu, Error **errp)
             (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
     }
 
+    if (s->kvm_dirty_ring_enabled) {
+        /* Use MAP_SHARED to share pages with the kernel */
+        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size,
+                                   PROT_READ | PROT_WRITE, MAP_SHARED,
+                                   cpu->kvm_fd,
+                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
+        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
+            ret = -errno;
+            DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
+            goto err;
+        }
+    }
+
     ret = kvm_arch_init_vcpu(cpu);
     if (ret < 0) {
         error_setg_errno(errp, -ret,
@@ -586,6 +627,11 @@  static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
     cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
 }
 
+static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
+{
+    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
+}
+
 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 
 /* Allocate the dirty bitmap for a slot  */
@@ -642,6 +688,170 @@  static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
     return ret == 0;
 }
 
+/* Should be with all slots_lock held for the address spaces. */
+static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
+                                     uint32_t slot_id, uint64_t offset)
+{
+    KVMMemoryListener *kml;
+    KVMSlot *mem;
+
+    if (as_id >= s->nr_as) {
+        return;
+    }
+
+    kml = s->as[as_id].ml;
+    mem = &kml->slots[slot_id];
+
+    if (!mem->memory_size || offset >= (mem->memory_size / TARGET_PAGE_SIZE)) {
+        return;
+    }
+
+    set_bit(offset, mem->dirty_bmap);
+}
+
+static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
+{
+    return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
+}
+
+static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
+{
+    gfn->flags = KVM_DIRTY_GFN_F_RESET;
+}
+
+/*
+ * Should be with all slots_lock held for the address spaces.  It returns the
+ * dirty page we've collected on this dirty ring.
+ */
+static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
+{
+    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
+    uint32_t gfn_count = s->kvm_dirty_gfn_count;
+    uint32_t count = 0, fetch = cpu->kvm_fetch_index;
+
+    assert(dirty_gfns && gfn_count);
+    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
+
+    while (true) {
+        cur = &dirty_gfns[fetch % gfn_count];
+        if (!dirty_gfn_is_dirtied(cur)) {
+            break;
+        }
+        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
+                                 cur->offset);
+        dirty_gfn_set_collected(cur);
+        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
+        fetch++;
+        count++;
+    }
+    cpu->kvm_fetch_index = fetch;
+
+    return count;
+}
+
+/* Must be with slots_lock held */
+static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
+{
+    int ret;
+    CPUState *cpu;
+    uint64_t total = 0;
+    int64_t stamp;
+
+    stamp = get_clock();
+
+    CPU_FOREACH(cpu) {
+        total += kvm_dirty_ring_reap_one(s, cpu);
+    }
+
+    if (total) {
+        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
+        assert(ret == total);
+    }
+
+    stamp = get_clock() - stamp;
+
+    if (total) {
+        trace_kvm_dirty_ring_reap(total, stamp / 1000);
+    }
+
+    return total;
+}
+
+/*
+ * Currently for simplicity, we must hold BQL before calling this.  We can
+ * consider to drop the BQL if we're clear with all the race conditions.
+ */
+static uint64_t kvm_dirty_ring_reap(KVMState *s)
+{
+    uint64_t total;
+
+    /*
+     * We need to lock all kvm slots for all address spaces here,
+     * because:
+     *
+     * (1) We need to mark dirty for dirty bitmaps in multiple slots
+     *     and for tons of pages, so it's better to take the lock here
+     *     once rather than once per page.  And more importantly,
+     *
+     * (2) We must _NOT_ publish dirty bits to the other threads
+     *     (e.g., the migration thread) via the kvm memory slot dirty
+     *     bitmaps before correctly re-protect those dirtied pages.
+     *     Otherwise we can have potential risk of data corruption if
+     *     the page data is read in the other thread before we do
+     *     reset below.
+     */
+    kvm_slots_lock();
+    total = kvm_dirty_ring_reap_locked(s);
+    kvm_slots_unlock();
+
+    return total;
+}
+
+static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
+{
+    /* No need to do anything */
+}
+
+/*
+ * Kick all vcpus out in a synchronized way.  When returned, we
+ * guarantee that every vcpu has been kicked and at least returned to
+ * userspace once.
+ */
+static void kvm_cpu_synchronize_kick_all(void)
+{
+    CPUState *cpu;
+
+    CPU_FOREACH(cpu) {
+        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
+    }
+}
+
+/*
+ * Flush all the existing dirty pages to the KVM slot buffers.  When
+ * this call returns, we guarantee that all the touched dirty pages
+ * before calling this function have been put into the per-kvmslot
+ * dirty bitmap.
+ *
+ * This function must be called with BQL held.
+ */
+static void kvm_dirty_ring_flush(struct KVMDirtyRingReaper *r)
+{
+    trace_kvm_dirty_ring_flush(0);
+    /*
+     * The function needs to be serialized.  Since this function
+     * should always be with BQL held, serialization is guaranteed.
+     * However, let's be sure of it.
+     */
+    assert(qemu_mutex_iothread_locked());
+    /*
+     * First make sure to flush the hardware buffers by kicking all
+     * vcpus out in a synchronous way.
+     */
+    kvm_cpu_synchronize_kick_all();
+    kvm_dirty_ring_reap(kvm_state);
+    trace_kvm_dirty_ring_flush(1);
+}
+
 /**
  * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
  *
@@ -1174,7 +1384,24 @@  static void kvm_set_phys_mem(KVMMemoryListener *kml,
                 goto out;
             }
             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-                kvm_slot_get_dirty_log(kvm_state, mem);
+                /*
+                 * NOTE: We should be aware of the fact that here we're only
+                 * doing a best effort to sync dirty bits.  No matter whether
+                 * we're using dirty log or dirty ring, we ignored two facts:
+                 *
+                 * (1) dirty bits can reside in hardware buffers (PML)
+                 *
+                 * (2) after we collected dirty bits here, pages can be dirtied
+                 * again before we do the final KVM_SET_USER_MEMORY_REGION to
+                 * remove the slot.
+                 *
+                 * Not easy.  Let's cross the fingers until it's fixed.
+                 */
+                if (kvm_state->kvm_dirty_ring_enabled) {
+                    kvm_dirty_ring_reap_locked(kvm_state);
+                } else {
+                    kvm_slot_get_dirty_log(kvm_state, mem);
+                }
                 kvm_slot_sync_dirty_pages(mem);
             }
 
@@ -1222,6 +1449,51 @@  out:
     kvm_slots_unlock();
 }
 
+static void *kvm_dirty_ring_reaper_thread(void *data)
+{
+    KVMState *s = data;
+    struct KVMDirtyRingReaper *r = &s->reaper;
+
+    rcu_register_thread();
+
+    trace_kvm_dirty_ring_reaper("init");
+
+    while (true) {
+        r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
+        trace_kvm_dirty_ring_reaper("wait");
+        /*
+         * TODO: provide a smarter timeout rather than a constant?
+         */
+        sleep(1);
+
+        trace_kvm_dirty_ring_reaper("wakeup");
+        r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
+
+        qemu_mutex_lock_iothread();
+        kvm_dirty_ring_reap(s);
+        qemu_mutex_unlock_iothread();
+
+        r->reaper_iteration++;
+    }
+
+    trace_kvm_dirty_ring_reaper("exit");
+
+    rcu_unregister_thread();
+
+    return NULL;
+}
+
+static int kvm_dirty_ring_reaper_init(KVMState *s)
+{
+    struct KVMDirtyRingReaper *r = &s->reaper;
+
+    qemu_thread_create(&r->reaper_thr, "kvm-reaper",
+                       kvm_dirty_ring_reaper_thread,
+                       s, QEMU_THREAD_JOINABLE);
+
+    return 0;
+}
+
 static void kvm_region_add(MemoryListener *listener,
                            MemoryRegionSection *section)
 {
@@ -1250,6 +1522,36 @@  static void kvm_log_sync(MemoryListener *listener,
     kvm_slots_unlock();
 }
 
+static void kvm_log_sync_global(MemoryListener *l)
+{
+    KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
+    KVMState *s = kvm_state;
+    KVMSlot *mem;
+    int i;
+
+    /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
+    kvm_dirty_ring_flush(&s->reaper);
+
+    /*
+     * TODO: make this faster when nr_slots is big while there are
+     * only a few used slots (small VMs).
+     */
+    kvm_slots_lock();
+    for (i = 0; i < s->nr_slots; i++) {
+        mem = &kml->slots[i];
+        if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+            kvm_slot_sync_dirty_pages(mem);
+            /*
+             * This is not needed by KVM_GET_DIRTY_LOG because the
+             * ioctl will unconditionally overwrite the whole region.
+             * However kvm dirty ring has no such side effect.
+             */
+            kvm_slot_reset_dirty_pages(mem);
+        }
+    }
+    kvm_slots_unlock();
+}
+
 static void kvm_log_clear(MemoryListener *listener,
                           MemoryRegionSection *section)
 {
@@ -1356,10 +1658,15 @@  void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
     kml->listener.region_del = kvm_region_del;
     kml->listener.log_start = kvm_log_start;
     kml->listener.log_stop = kvm_log_stop;
-    kml->listener.log_sync = kvm_log_sync;
-    kml->listener.log_clear = kvm_log_clear;
     kml->listener.priority = 10;
 
+    if (s->kvm_dirty_ring_enabled) {
+        kml->listener.log_sync_global = kvm_log_sync_global;
+    } else {
+        kml->listener.log_sync = kvm_log_sync;
+        kml->listener.log_clear = kvm_log_clear;
+    }
+
     memory_listener_register(&kml->listener, as);
 
     for (i = 0; i < s->nr_as; ++i) {
@@ -2281,6 +2588,14 @@  static int kvm_init(MachineState *ms)
         ret = ram_block_discard_disable(true);
         assert(!ret);
     }
+
+    if (s->kvm_dirty_ring_enabled) {
+        ret = kvm_dirty_ring_reaper_init(s);
+        if (ret) {
+            goto err;
+        }
+    }
+
     return 0;
 
 err:
@@ -2593,6 +2908,17 @@  int kvm_cpu_exec(CPUState *cpu)
         case KVM_EXIT_INTERNAL_ERROR:
             ret = kvm_handle_internal_error(cpu, run);
             break;
+        case KVM_EXIT_DIRTY_RING_FULL:
+            /*
+             * We shouldn't continue if the dirty ring of this vcpu is
+             * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
+             */
+            trace_kvm_dirty_ring_full(cpu->cpu_index);
+            qemu_mutex_lock_iothread();
+            kvm_dirty_ring_reap(kvm_state);
+            qemu_mutex_unlock_iothread();
+            ret = 0;
+            break;
         case KVM_EXIT_SYSTEM_EVENT:
             switch (run->system_event.type) {
             case KVM_SYSTEM_EVENT_SHUTDOWN:
diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
index e15ae8980d3..72a01320a1a 100644
--- a/accel/kvm/trace-events
+++ b/accel/kvm/trace-events
@@ -18,4 +18,11 @@  kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t
 kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
 kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32
 kvm_resample_fd_notify(int gsi) "gsi %d"
+kvm_dirty_ring_full(int id) "vcpu %d"
+kvm_dirty_ring_reap_vcpu(int id) "vcpu %d"
+kvm_dirty_ring_page(int vcpu, uint32_t slot, uint64_t offset) "vcpu %d fetch %"PRIu32" offset 0x%"PRIx64
+kvm_dirty_ring_reaper(const char *s) "%s"
+kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)"
+kvm_dirty_ring_reaper_kick(const char *reason) "%s"
+kvm_dirty_ring_flush(int finished) "%d"
 
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index c68bc3ba8af..2f0991d93f7 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -323,6 +323,11 @@  struct qemu_work_item;
  * @ignore_memory_transaction_failures: Cached copy of the MachineState
  *    flag of the same name: allows the board to suppress calling of the
  *    CPU do_transaction_failed hook function.
+ * @kvm_dirty_ring_full:
+ *   Whether the kvm dirty ring of this vcpu is soft-full.
+ * @kvm_dirty_ring_avail:
+ *   Semaphore to be posted when the kvm dirty ring of the vcpu is
+ *   available again.
  *
  * State of one CPU core or thread.
  */
@@ -394,9 +399,12 @@  struct CPUState {
      */
     uintptr_t mem_io_pc;
 
+    /* Only used in KVM */
     int kvm_fd;
     struct KVMState *kvm_state;
     struct kvm_run *kvm_run;
+    struct kvm_dirty_gfn *kvm_dirty_gfns;
+    uint32_t kvm_fetch_index;
 
     /* Used for events with 'vcpu' and *without* the 'disabled' properties */
     DECLARE_BITMAP(trace_dstate_delayed, CPU_TRACE_DSTATE_MAX_EVENTS);