
[v14,6/7] softmmu/dirtylimit: Implement virtual CPU throttle

Message ID ad0a6e05b5bec2c2c8dd3a7663e39e3cf9af71a3.1644509582.git.huangy81@chinatelecom.cn
State: New
Series: support dirty restraint on vCPU

Commit Message

Hyman Huang Feb. 10, 2022, 4:17 p.m. UTC
From: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>

Set up a negative feedback system for the vCPU thread when it handles
a KVM_EXIT_DIRTY_RING_FULL exit, by introducing a throttle_us_per_full
field in struct CPUState. The vCPU sleeps for throttle_us_per_full
microseconds on each such exit to throttle itself if dirtylimit is in
service.
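
To give a rough feel for the arithmetic (the numbers below are purely
illustrative): with a 4096-entry dirty ring and 4 KiB pages the ring
covers 16 MB, so at a measured dirty rate of 300 MB/s it fills in
about 53 ms. For a 100 MB/s quota the sleep percentage comes out as
(300 - 100) * 100 / 300 = 66%, i.e. a sleep of about 53 ms * 66 / 34,
roughly 104 ms per ring-full exit. The vCPU then runs for ~53 ms and
sleeps for ~104 ms per cycle, which brings its effective dirty rate
close to the 100 MB/s quota.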

Signed-off-by: Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
---
 accel/kvm/kvm-all.c         |  14 +-
 include/hw/core/cpu.h       |   6 +
 include/sysemu/dirtylimit.h |  15 +++
 softmmu/dirtylimit.c        | 306 ++++++++++++++++++++++++++++++++++++++++++++
 softmmu/trace-events        |   8 ++
 5 files changed, 348 insertions(+), 1 deletion(-)

Comments

Peter Xu Feb. 14, 2022, 8:20 a.m. UTC | #1
On Fri, Feb 11, 2022 at 12:17:40AM +0800, huangy81@chinatelecom.cn wrote:
> @@ -2964,8 +2971,13 @@ int kvm_cpu_exec(CPUState *cpu)
>               */
>              trace_kvm_dirty_ring_full(cpu->cpu_index);
>              qemu_mutex_lock_iothread();
> -            kvm_dirty_ring_reap(kvm_state, NULL);
> +            if (dirtylimit_in_service()) {
> +                kvm_dirty_ring_reap(kvm_state, cpu);
> +            } else {
> +                kvm_dirty_ring_reap(kvm_state, NULL);
> +            }

Could you add some comment here on why the cpu pointer is conditionally passed
into the reaping routine?  Even if we know it now, it's not immediately obvious
to all the readers.
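
E.g. something like the below (the comment wording is only a
placeholder; please state the actual reason):

            /*
             * Placeholder wording: the reaper thread keeps sleeping
             * while dirtylimit is in service (see
             * kvm_dirty_ring_reaper_thread), so reap this vCPU's own
             * ring from the vCPU thread instead of asking for a
             * global reap.
             */
            if (dirtylimit_in_service()) {
                kvm_dirty_ring_reap(kvm_state, cpu);
            } else {
                kvm_dirty_ring_reap(kvm_state, NULL);
            }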

[...]

> +struct {
> +    VcpuDirtyLimitState *states;
> +    /* Max cpus number configured by user */
> +    int max_cpus;
> +    /* Number of vcpu under dirtylimit */
> +    int limited_nvcpu;
> +    /* Function to implement throttle set up */
> +    DirtyLimitFunc setup;

"setup" normally is used only at startup of something, but not per interval.
Perhaps "process" or "adjust"?  Same across other "setup" namings across the
patch.

Again, I'd rather call the function directly..

[...]

> +static void dirtylimit_adjust_throttle(CPUState *cpu)
> +{
> +    uint64_t quota = 0;
> +    uint64_t current = 0;
> +    int cpu_index = cpu->cpu_index;
> +
> +    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
> +    current = vcpu_dirty_rate_get(cpu_index);
> +
> +    if (current == 0) {
> +        cpu->throttle_us_per_full = 0;
> +        goto end;

Can be dropped?

> +    } else if (dirtylimit_done(quota, current)) {
> +        goto end;

Same here.  Dropping it wholly and:

       } else if (!dirtylimit_done(quota, current)) {
           dirtylimit_set_throttle(cpu, quota, current);
       }

Would work?

> +    } else {
> +        dirtylimit_set_throttle(cpu, quota, current);
> +    }
> +end:

Can be dropped?
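
I.e., untested, but the whole function could then shrink to:

    static void dirtylimit_adjust_throttle(CPUState *cpu)
    {
        uint64_t quota = dirtylimit_vcpu_get_state(cpu->cpu_index)->quota;
        uint64_t current = vcpu_dirty_rate_get(cpu->cpu_index);

        if (current == 0) {
            cpu->throttle_us_per_full = 0;
        } else if (!dirtylimit_done(quota, current)) {
            dirtylimit_set_throttle(cpu, quota, current);
        }

        trace_dirtylimit_adjust_throttle(cpu->cpu_index, quota, current,
                                         cpu->throttle_us_per_full);
    }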

> +    trace_dirtylimit_adjust_throttle(cpu_index,
> +                                     quota, current,
> +                                     cpu->throttle_us_per_full);
> +    return;
> +}
> +
> +void dirtylimit_setup(void)
> +{
> +    CPUState *cpu;
> +
> +    if (!qatomic_read(&dirtylimit_quit)) {
> +        dirtylimit_state_lock();
> +
> +        if (!dirtylimit_in_service()) {
> +            dirtylimit_state_unlock();

Need to return?
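
I.e. (sketch):

        if (!dirtylimit_in_service()) {
            dirtylimit_state_unlock();
            return;
        }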

> +        }
> +
> +        CPU_FOREACH(cpu) {
> +            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
> +                continue;
> +            }
> +            dirtylimit_adjust_throttle(cpu);
> +        }
> +        dirtylimit_state_unlock();
> +    }
> +}

[...]

> +void dirtylimit_set_vcpu(int cpu_index,
> +                         uint64_t quota,
> +                         bool enable)
> +{
> +    dirtylimit_vcpu_set_quota(cpu_index, quota, enable);
> +    trace_dirtylimit_set_vcpu(cpu_index, quota);
> +}

This helper is not "help"ful..  How about wrapping the trace into
dirtylimit_vcpu_set_quota, then drop it?
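
E.g. (sketch only; presumably it would lose the static inline and be
exported in place of dirtylimit_set_vcpu(), with the trace folded in
at the end):

    /* exported instead of dirtylimit_set_vcpu() */
    void dirtylimit_vcpu_set_quota(int cpu_index, uint64_t quota, bool on)
    {
        if (on) {
            dirtylimit_state->states[cpu_index].quota = quota;
            if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
                dirtylimit_state->limited_nvcpu++;
            }
        } else {
            dirtylimit_state->states[cpu_index].quota = 0;
            if (dirtylimit_state->states[cpu_index].enabled) {
                dirtylimit_state->limited_nvcpu--;
            }
        }

        dirtylimit_state->states[cpu_index].enabled = on;
        trace_dirtylimit_set_vcpu(cpu_index, quota);
    }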

Thanks,
Hyman Huang Feb. 14, 2022, 9:05 a.m. UTC | #2
On 2022/2/14 16:20, Peter Xu wrote:
> On Fri, Feb 11, 2022 at 12:17:40AM +0800, huangy81@chinatelecom.cn wrote:
>> @@ -2964,8 +2971,13 @@ int kvm_cpu_exec(CPUState *cpu)
>>                */
>>               trace_kvm_dirty_ring_full(cpu->cpu_index);
>>               qemu_mutex_lock_iothread();
>> -            kvm_dirty_ring_reap(kvm_state, NULL);
>> +            if (dirtylimit_in_service()) {
>> +                kvm_dirty_ring_reap(kvm_state, cpu);
>> +            } else {
>> +                kvm_dirty_ring_reap(kvm_state, NULL);
>> +            }
> 
> Could you add some comment here on why the cpu pointer is conditionally passed
> into the reaping routine?  Even if we know it now, it's not immediately obvious
> to all the readers.
Sure.
> 
> [...]
> 
>> +struct {
>> +    VcpuDirtyLimitState *states;
>> +    /* Max cpus number configured by user */
>> +    int max_cpus;
>> +    /* Number of vcpu under dirtylimit */
>> +    int limited_nvcpu;
>> +    /* Function to implement throttle set up */
>> +    DirtyLimitFunc setup;
> 
> "setup" normally is used only at startup of something, but not per interval.
> Perhaps "process" or "adjust"?  Same across other "setup" namings across the
> patch.
Ok, 'adjust' is fine.
> 
> Again, I'd rather call the function directly..
Um, maybe using the function pointer is more extensible.

[...]
static void *vcpu_dirty_rate_stat_thread(void *opaque)
{
     rcu_register_thread();

     /* start log sync */
     global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);

     while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
         vcpu_dirty_rate_stat_collect();
         if (dirtylimit_in_service() &&
             dirtylimit_state->setup) {
             dirtylimit_state->setup();
         }
     }

     /* stop log sync */
     global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);

     rcu_unregister_thread();
     return NULL;
}
[...]

Using a function pointer keeps the 'dirtyrate-stat' logic and the
'dirtylimit' logic somewhat decoupled.

But I'm OK with it if you insist, since it's only about how the
'dirtylimit' code gets called and doesn't affect the overall logic.

> 
> [...]
> 
>> +static void dirtylimit_adjust_throttle(CPUState *cpu)
>> +{
>> +    uint64_t quota = 0;
>> +    uint64_t current = 0;
>> +    int cpu_index = cpu->cpu_index;
>> +
>> +    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
>> +    current = vcpu_dirty_rate_get(cpu_index);
>> +
>> +    if (current == 0) {
>> +        cpu->throttle_us_per_full = 0;
>> +        goto end;
> 
> Can be dropped?
> 
>> +    } else if (dirtylimit_done(quota, current)) {
>> +        goto end;
> 
> Same here.  Dropping it wholly and:
> 
>         } else if (!dirtylimit_done(quota, current)) {
>             dirtylimit_set_throttle(cpu, quota, current);
>         }
> 
> Would work?
> 
>> +    } else {
>> +        dirtylimit_set_throttle(cpu, quota, current);
>> +    }
>> +end:
> 
> Can be dropped?
> 
>> +    trace_dirtylimit_adjust_throttle(cpu_index,
>> +                                     quota, current,
>> +                                     cpu->throttle_us_per_full);
>> +    return;
>> +}
>> +
>> +void dirtylimit_setup(void)
>> +{
>> +    CPUState *cpu;
>> +
>> +    if (!qatomic_read(&dirtylimit_quit)) {
>> +        dirtylimit_state_lock();
>> +
>> +        if (!dirtylimit_in_service()) {
>> +            dirtylimit_state_unlock();
> 
> Need to return?
> 
>> +        }
>> +
>> +        CPU_FOREACH(cpu) {
>> +            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
>> +                continue;
>> +            }
>> +            dirtylimit_adjust_throttle(cpu);
>> +        }
>> +        dirtylimit_state_unlock();
>> +    }
>> +}
> 
> [...]
> 
>> +void dirtylimit_set_vcpu(int cpu_index,
>> +                         uint64_t quota,
>> +                         bool enable)
>> +{
>> +    dirtylimit_vcpu_set_quota(cpu_index, quota, enable);
>> +    trace_dirtylimit_set_vcpu(cpu_index, quota);
>> +}
> 
> This helper is not "help"ful..  How about wrapping the trace into
> dirtylimit_vcpu_set_quota, then drop it?
> 
> Thanks,
>
Hyman Huang Feb. 14, 2022, 9:22 a.m. UTC | #3
On 2022/2/14 16:20, Peter Xu wrote:
> On Fri, Feb 11, 2022 at 12:17:40AM +0800, huangy81@chinatelecom.cn wrote:
>> @@ -2964,8 +2971,13 @@ int kvm_cpu_exec(CPUState *cpu)
>>                */
>>               trace_kvm_dirty_ring_full(cpu->cpu_index);
>>               qemu_mutex_lock_iothread();
>> -            kvm_dirty_ring_reap(kvm_state, NULL);
>> +            if (dirtylimit_in_service()) {
>> +                kvm_dirty_ring_reap(kvm_state, cpu);
>> +            } else {
>> +                kvm_dirty_ring_reap(kvm_state, NULL);
>> +            }
> 
> Could you add some comment here on why the cpu pointer is conditionally passed
> into the reaping routine?  Even if we know it now, it's not immediately obvious
> to all the readers.
> 
> [...]
> 
>> +struct {
>> +    VcpuDirtyLimitState *states;
>> +    /* Max cpus number configured by user */
>> +    int max_cpus;
>> +    /* Number of vcpu under dirtylimit */
>> +    int limited_nvcpu;
>> +    /* Function to implement throttle set up */
>> +    DirtyLimitFunc setup;
> 
> "setup" normally is used only at startup of something, but not per interval.
> Perhaps "process" or "adjust"?  Same across other "setup" namings across the
> patch.
> 
> Again, I'd rather call the function directly..
> 
> [...]
> 
>> +static void dirtylimit_adjust_throttle(CPUState *cpu)
>> +{
>> +    uint64_t quota = 0;
>> +    uint64_t current = 0;
>> +    int cpu_index = cpu->cpu_index;
>> +
>> +    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
>> +    current = vcpu_dirty_rate_get(cpu_index);
>> +
>> +    if (current == 0) {
>> +        cpu->throttle_us_per_full = 0;
>> +        goto end;
> 
> Can be dropped?
OK, I'll move this block into dirtylimit_set_throttle.
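
Something like this at the top of dirtylimit_set_throttle() (sketch):

    if (current == 0) {
        cpu->throttle_us_per_full = 0;
        return;
    }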
> 
>> +    } else if (dirtylimit_done(quota, current)) {
>> +        goto end;
> 
> Same here.  Dropping it wholly and:
> 
>         } else if (!dirtylimit_done(quota, current)) {
>             dirtylimit_set_throttle(cpu, quota, current);
>         }
> 
> Would work?
Yes.
> 
>> +    } else {
>> +        dirtylimit_set_throttle(cpu, quota, current);
>> +    }
>> +end:
> 
> Can be dropped?
Ok.
> 
>> +    trace_dirtylimit_adjust_throttle(cpu_index,
>> +                                     quota, current,
>> +                                     cpu->throttle_us_per_full);
>> +    return;
>> +}
>> +
>> +void dirtylimit_setup(void)
>> +{
>> +    CPUState *cpu;
>> +
>> +    if (!qatomic_read(&dirtylimit_quit)) {
>> +        dirtylimit_state_lock();
>> +
>> +        if (!dirtylimit_in_service()) {
>> +            dirtylimit_state_unlock();
> 
> Need to return?
My fault. :(
> 
>> +        }
>> +
>> +        CPU_FOREACH(cpu) {
>> +            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
>> +                continue;
>> +            }
>> +            dirtylimit_adjust_throttle(cpu);
>> +        }
>> +        dirtylimit_state_unlock();
>> +    }
>> +}
> 
> [...]
> 
>> +void dirtylimit_set_vcpu(int cpu_index,
>> +                         uint64_t quota,
>> +                         bool enable)
>> +{
>> +    dirtylimit_vcpu_set_quota(cpu_index, quota, enable);
>> +    trace_dirtylimit_set_vcpu(cpu_index, quota);
>> +}
> 
> This helper is not "help"ful..  How about wrapping the trace into
> dirtylimit_vcpu_set_quota, then drop it?
> 
Ok.
> Thanks,
>
Peter Xu Feb. 14, 2022, 9:51 a.m. UTC | #4
On Mon, Feb 14, 2022 at 05:05:38PM +0800, Hyman Huang wrote:
> But I'm OK with it if you insist, since it's only about how the
> 'dirtylimit' code gets called and doesn't affect the overall logic.

If you don't have a plan to add e.g. another adjust() method, then it's
pointless.
The hook can be easily added on top when necessary.

But I don't insist - your call. :)  Not really a big deal.
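
FWIW the direct call would simply be something like (assuming the
per-interval hook gets renamed to, say, dirtylimit_process()):

    while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
        vcpu_dirty_rate_stat_collect();
        if (dirtylimit_in_service()) {
            /* i.e. today's dirtylimit_setup(), renamed */
            dirtylimit_process();
        }
    }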

Patch

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index 8821d80..5ca752b 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -45,6 +45,7 @@ 
 #include "qemu/guest-random.h"
 #include "sysemu/hw_accel.h"
 #include "kvm-cpus.h"
+#include "sysemu/dirtylimit.h"
 
 #include "hw/boards.h"
 
@@ -476,6 +477,7 @@  int kvm_init_vcpu(CPUState *cpu, Error **errp)
     cpu->kvm_state = s;
     cpu->vcpu_dirty = true;
     cpu->dirty_pages = 0;
+    cpu->throttle_us_per_full = 0;
 
     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
     if (mmap_size < 0) {
@@ -1469,6 +1471,11 @@  static void *kvm_dirty_ring_reaper_thread(void *data)
          */
         sleep(1);
 
+        /* keep sleeping so that dirtylimit not be interfered by reaper */
+        if (dirtylimit_in_service()) {
+            continue;
+        }
+
         trace_kvm_dirty_ring_reaper("wakeup");
         r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
 
@@ -2964,8 +2971,13 @@  int kvm_cpu_exec(CPUState *cpu)
              */
             trace_kvm_dirty_ring_full(cpu->cpu_index);
             qemu_mutex_lock_iothread();
-            kvm_dirty_ring_reap(kvm_state, NULL);
+            if (dirtylimit_in_service()) {
+                kvm_dirty_ring_reap(kvm_state, cpu);
+            } else {
+                kvm_dirty_ring_reap(kvm_state, NULL);
+            }
             qemu_mutex_unlock_iothread();
+            dirtylimit_vcpu_execute(cpu);
             ret = 0;
             break;
         case KVM_EXIT_SYSTEM_EVENT:
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 76ab3b8..dbeb31a 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -411,6 +411,12 @@  struct CPUState {
      */
     bool throttle_thread_scheduled;
 
+    /*
+     * Sleep throttle_us_per_full microseconds once dirty ring is full
+     * if dirty page rate limit is enabled.
+     */
+    int64_t throttle_us_per_full;
+
     bool ignore_memory_transaction_failures;
 
     /* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
index da459f0..4b7effe 100644
--- a/include/sysemu/dirtylimit.h
+++ b/include/sysemu/dirtylimit.h
@@ -19,4 +19,19 @@  void vcpu_dirty_rate_stat_start(void);
 void vcpu_dirty_rate_stat_stop(void);
 void vcpu_dirty_rate_stat_initialize(void);
 void vcpu_dirty_rate_stat_finalize(void);
+
+void dirtylimit_state_lock(void);
+void dirtylimit_state_unlock(void);
+void dirtylimit_state_initialize(void);
+void dirtylimit_state_finalize(void);
+bool dirtylimit_in_service(void);
+bool dirtylimit_vcpu_index_valid(int cpu_index);
+void dirtylimit_setup(void);
+void dirtylimit_change(bool start);
+void dirtylimit_set_vcpu(int cpu_index,
+                         uint64_t quota,
+                         bool enable);
+void dirtylimit_set_all(uint64_t quota,
+                        bool enable);
+void dirtylimit_vcpu_execute(CPUState *cpu);
 #endif
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index a10ac6f..8b8d8d7 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -18,6 +18,26 @@ 
 #include "sysemu/dirtylimit.h"
 #include "exec/memory.h"
 #include "hw/boards.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
+
+/*
+ * Dirtylimit stop working if dirty page rate error
+ * value less than DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */
+/*
+ * Plus or minus vcpu sleep time linearly if dirty
+ * page rate error value percentage over
+ * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
+ * Otherwise, plus or minus a fixed vcpu sleep time.
+ */
+#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50
+/*
+ * Max vcpu sleep time percentage during a cycle
+ * composed of dirty ring full and sleep time.
+ */
+#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
 
 struct {
     VcpuStat stat;
@@ -25,6 +45,34 @@  struct {
     QemuThread thread;
 } *vcpu_dirty_rate_stat;
 
+typedef struct VcpuDirtyLimitState {
+    int cpu_index;
+    bool enabled;
+    /*
+     * Quota dirty page rate, unit is MB/s
+     * zero if not enabled.
+     */
+    uint64_t quota;
+} VcpuDirtyLimitState;
+
+typedef void (*DirtyLimitFunc)(void);
+
+struct {
+    VcpuDirtyLimitState *states;
+    /* Max cpus number configured by user */
+    int max_cpus;
+    /* Number of vcpu under dirtylimit */
+    int limited_nvcpu;
+    /* Function to implement throttle set up */
+    DirtyLimitFunc setup;
+} *dirtylimit_state;
+
+/* protect dirtylimit_state */
+static QemuMutex dirtylimit_mutex;
+
+/* dirtylimit thread quit if dirtylimit_quit is true */
+static bool dirtylimit_quit;
+
 static void vcpu_dirty_rate_stat_collect(void)
 {
     int64_t start_time;
@@ -58,6 +106,10 @@  static void *vcpu_dirty_rate_stat_thread(void *opaque)
 
     while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
         vcpu_dirty_rate_stat_collect();
+        if (dirtylimit_in_service() &&
+            dirtylimit_state->setup) {
+            dirtylimit_state->setup();
+        }
     }
 
     /* stop log sync */
@@ -90,9 +142,11 @@  void vcpu_dirty_rate_stat_start(void)
 void vcpu_dirty_rate_stat_stop(void)
 {
     qatomic_set(&vcpu_dirty_rate_stat->running, 0);
+    dirtylimit_state_unlock();
     qemu_mutex_unlock_iothread();
     qemu_thread_join(&vcpu_dirty_rate_stat->thread);
     qemu_mutex_lock_iothread();
+    dirtylimit_state_lock();
 }
 
 void vcpu_dirty_rate_stat_initialize(void)
@@ -118,3 +172,255 @@  void vcpu_dirty_rate_stat_finalize(void)
     free(vcpu_dirty_rate_stat);
     vcpu_dirty_rate_stat = NULL;
 }
+
+void dirtylimit_state_lock(void)
+{
+    qemu_mutex_lock(&dirtylimit_mutex);
+}
+
+void dirtylimit_state_unlock(void)
+{
+    qemu_mutex_unlock(&dirtylimit_mutex);
+}
+
+static void
+__attribute__((__constructor__)) dirtylimit_mutex_init(void)
+{
+    qemu_mutex_init(&dirtylimit_mutex);
+}
+
+static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
+{
+    return &dirtylimit_state->states[cpu_index];
+}
+
+void dirtylimit_state_initialize(void)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    int max_cpus = ms->smp.max_cpus;
+    int i;
+
+    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
+
+    dirtylimit_state->states =
+            g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
+
+    for (i = 0; i < max_cpus; i++) {
+        dirtylimit_state->states[i].cpu_index = i;
+    }
+
+    dirtylimit_state->max_cpus = max_cpus;
+    dirtylimit_state->setup = dirtylimit_setup;
+    trace_dirtylimit_state_initialize(max_cpus);
+}
+
+void dirtylimit_state_finalize(void)
+{
+    free(dirtylimit_state->states);
+    dirtylimit_state->states = NULL;
+    dirtylimit_state->setup = NULL;
+
+    free(dirtylimit_state);
+    dirtylimit_state = NULL;
+
+    trace_dirtylimit_state_finalize();
+}
+
+bool dirtylimit_in_service(void)
+{
+    return !!dirtylimit_state;
+}
+
+bool dirtylimit_vcpu_index_valid(int cpu_index)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+
+    return !(cpu_index < 0 ||
+             cpu_index >= ms->smp.max_cpus);
+}
+
+static inline void dirtylimit_vcpu_set_quota(int cpu_index,
+                                             uint64_t quota,
+                                             bool on)
+{
+    if (on) {
+        dirtylimit_state->states[cpu_index].quota = quota;
+        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
+            dirtylimit_state->limited_nvcpu++;
+        }
+    } else {
+        dirtylimit_state->states[cpu_index].quota = 0;
+        if (dirtylimit_state->states[cpu_index].enabled) {
+            dirtylimit_state->limited_nvcpu--;
+        }
+    }
+
+    dirtylimit_state->states[cpu_index].enabled = on;
+}
+
+static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+{
+    static uint64_t max_dirtyrate;
+    uint32_t dirty_ring_size = kvm_dirty_ring_size();
+    uint64_t dirty_ring_size_meory_MB =
+        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
+
+    if (max_dirtyrate < dirtyrate) {
+        max_dirtyrate = dirtyrate;
+    }
+
+    return dirty_ring_size_meory_MB * 1000000 / max_dirtyrate;
+}
+
+static inline bool dirtylimit_done(uint64_t quota,
+                                   uint64_t current)
+{
+    uint64_t min, max;
+
+    min = MIN(quota, current);
+    max = MAX(quota, current);
+
+    return ((max - min) <= DIRTYLIMIT_TOLERANCE_RANGE) ? true : false;
+}
+
+static inline bool
+dirtylimit_need_linear_adjustment(uint64_t quota,
+                                  uint64_t current)
+{
+    uint64_t min, max;
+
+    min = MIN(quota, current);
+    max = MAX(quota, current);
+
+    return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
+}
+
+static void dirtylimit_set_throttle(CPUState *cpu,
+                                    uint64_t quota,
+                                    uint64_t current)
+{
+    int64_t ring_full_time_us = 0;
+    uint64_t sleep_pct = 0;
+    uint64_t throttle_us = 0;
+
+    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
+
+    if (dirtylimit_need_linear_adjustment(quota, current)) {
+        if (quota < current) {
+            sleep_pct = (current - quota) * 100 / current;
+            throttle_us =
+                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+            cpu->throttle_us_per_full += throttle_us;
+        } else {
+            sleep_pct = (quota - current) * 100 / quota;
+            throttle_us =
+                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+            cpu->throttle_us_per_full -= throttle_us;
+        }
+
+        trace_dirtylimit_throttle_pct(cpu->cpu_index,
+                                      sleep_pct,
+                                      throttle_us);
+    } else {
+        if (quota < current) {
+            cpu->throttle_us_per_full += ring_full_time_us / 10;
+        } else {
+            cpu->throttle_us_per_full -= ring_full_time_us / 10;
+        }
+    }
+
+    /*
+     * TODO: in the big kvm_dirty_ring_size case (eg: 65536, or other scenario),
+     *       current dirty page rate may never reach the quota, we should stop
+     *       increasing sleep time?
+     */
+    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
+        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
+
+    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
+}
+
+static void dirtylimit_adjust_throttle(CPUState *cpu)
+{
+    uint64_t quota = 0;
+    uint64_t current = 0;
+    int cpu_index = cpu->cpu_index;
+
+    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
+    current = vcpu_dirty_rate_get(cpu_index);
+
+    if (current == 0) {
+        cpu->throttle_us_per_full = 0;
+        goto end;
+    } else if (dirtylimit_done(quota, current)) {
+        goto end;
+    } else {
+        dirtylimit_set_throttle(cpu, quota, current);
+    }
+end:
+    trace_dirtylimit_adjust_throttle(cpu_index,
+                                     quota, current,
+                                     cpu->throttle_us_per_full);
+    return;
+}
+
+void dirtylimit_setup(void)
+{
+    CPUState *cpu;
+
+    if (!qatomic_read(&dirtylimit_quit)) {
+        dirtylimit_state_lock();
+
+        if (!dirtylimit_in_service()) {
+            dirtylimit_state_unlock();
+        }
+
+        CPU_FOREACH(cpu) {
+            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
+                continue;
+            }
+            dirtylimit_adjust_throttle(cpu);
+        }
+        dirtylimit_state_unlock();
+    }
+}
+
+void dirtylimit_change(bool start)
+{
+    if (start) {
+        qatomic_set(&dirtylimit_quit, 0);
+    } else {
+        qatomic_set(&dirtylimit_quit, 1);
+    }
+}
+
+void dirtylimit_set_vcpu(int cpu_index,
+                         uint64_t quota,
+                         bool enable)
+{
+    dirtylimit_vcpu_set_quota(cpu_index, quota, enable);
+    trace_dirtylimit_set_vcpu(cpu_index, quota);
+}
+
+void dirtylimit_set_all(uint64_t quota,
+                        bool enable)
+{
+    MachineState *ms = MACHINE(qdev_get_machine());
+    int max_cpus = ms->smp.max_cpus;
+    int i;
+
+    for (i = 0; i < max_cpus; i++) {
+        dirtylimit_set_vcpu(i, quota, enable);
+    }
+}
+
+void dirtylimit_vcpu_execute(CPUState *cpu)
+{
+    if (dirtylimit_in_service() &&
+        dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
+        cpu->throttle_us_per_full) {
+        trace_dirtylimit_vcpu_execute(cpu->cpu_index,
+                cpu->throttle_us_per_full);
+        usleep(cpu->throttle_us_per_full);
+    }
+}
diff --git a/softmmu/trace-events b/softmmu/trace-events
index 9c88887..ff441ac 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -31,3 +31,11 @@  runstate_set(int current_state, const char *current_state_str, int new_state, co
 system_wakeup_request(int reason) "reason=%d"
 qemu_system_shutdown_request(int reason) "reason=%d"
 qemu_system_powerdown_request(void) ""
+
+#dirtylimit.c
+dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d"
+dirtylimit_state_finalize(void)
+dirtylimit_adjust_throttle(int cpu_index, uint64_t quota, uint64_t current, int64_t time_us) "CPU[%d] throttle: quota %" PRIu64 ", current %" PRIu64 ", throttle %"PRIi64 " us"
+dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
+dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
+dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"