Patchwork [for-1.6,2/4] block: Modify the throttling code to implement the leaky bucket algorithm.

login
register
mail settings
Submitter Benoît Canet
Date July 22, 2013, 2:38 p.m.
Message ID <1374503922-27965-3-git-send-email-benoit@irqsave.net>
Download mbox | patch
Permalink /patch/260744/
State New
Headers show

Comments

Benoît Canet - July 22, 2013, 2:38 p.m.
This patch replaces the previous algorithm with the well-described leaky bucket
algorithm: a bucket is filled by the incoming IOs and a periodic timer decrements
the counter to make the bucket leak. When a given threshold is reached the
bucket is full and the IOs are held.

In this patch the threshold is set to a default value to make the code behave
like the previous implementation.

In the next patch the threshold will be exposed in QMP to let the user control
the burstiness of the throttling.

Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
 block.c                   |  410 +++++++++++++++++++++++++--------------------
 blockdev.c                |   71 ++++++--
 include/block/block_int.h |   15 +-
 3 files changed, 299 insertions(+), 197 deletions(-)

Patch

diff --git a/block.c b/block.c
index dc72643..2d6e9b4 100644
--- a/block.c
+++ b/block.c
@@ -86,13 +86,6 @@  static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors);
 
-static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
-        bool is_write, double elapsed_time, uint64_t *wait);
-static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
-        double elapsed_time, uint64_t *wait);
-static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
-        bool is_write, int64_t *wait);
-
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -101,6 +94,8 @@  static QLIST_HEAD(, BlockDriver) bdrv_drivers =
 
 /* If non-zero, use only whitelisted block drivers */
 static int use_bdrv_whitelist;
+/* boolean used to inform the throttling code that a bdrv_drain_all is issued */
+static bool draining;
 
 #ifdef _WIN32
 static int is_windows_drive_prefix(const char *filename)
@@ -135,15 +130,122 @@  void bdrv_io_limits_disable(BlockDriverState *bs)
         qemu_free_timer(bs->block_timer);
         bs->block_timer = NULL;
     }
+}
 
-    bs->slice_start = 0;
-    bs->slice_end   = 0;
+static void bdrv_make_bps_buckets_leak(BlockDriverState *bs, int64_t delta)
+{
+    int64_t *bytes = bs->leaky_buckets.bytes;
+    int64_t read_leak, write_leak;
+
+    /* the limit apply to both reads and writes */
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        /* compute half the total leak */
+        int64_t leak = ((bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] * delta) /
+                       NANOSECONDS_PER_SECOND);
+        int remain = leak % 2;
+        leak /= 2;
+
+        /* the read bucket is smaller than half the quantity to leak so take
+         * care adding the leak difference to write leak
+         */
+        if (bytes[BLOCK_IO_LIMIT_READ] <= leak) {
+            read_leak = bytes[BLOCK_IO_LIMIT_READ];
+            write_leak = 2 * leak + remain - bytes[BLOCK_IO_LIMIT_READ];
+        /* symetric case */
+        } else if (bytes[BLOCK_IO_LIMIT_WRITE] <= leak) {
+            write_leak = bytes[BLOCK_IO_LIMIT_WRITE];
+            read_leak = 2 * leak + remain - bytes[BLOCK_IO_LIMIT_WRITE];
+        /* both bucket above leak count use half the total leak for both */
+        } else {
+            write_leak = leak;
+            read_leak = leak + remain;
+        }
+    /* else we consider that limits are separated */
+    } else {
+        read_leak = (bs->io_limits.bps[BLOCK_IO_LIMIT_READ] * delta) /
+                    NANOSECONDS_PER_SECOND;
+        write_leak = (bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE] * delta) /
+                     NANOSECONDS_PER_SECOND;
+    }
+
+    /* make the buckets leak */
+    bytes[BLOCK_IO_LIMIT_READ]  = MAX(bytes[BLOCK_IO_LIMIT_READ] - read_leak,
+                                      0);
+    bytes[BLOCK_IO_LIMIT_WRITE] = MAX(bytes[BLOCK_IO_LIMIT_WRITE] - write_leak,
+                                      0);
 }
 
+static void bdrv_make_iops_buckets_leak(BlockDriverState *bs, int64_t delta)
+{
+    double *ios = bs->leaky_buckets.ios;
+    int64_t read_leak, write_leak;
+
+    /* the limit apply to both reads and writes */
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        /* compute half the total leak */
+        int64_t leak = ((bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] * delta) /
+                       NANOSECONDS_PER_SECOND);
+        int remain = leak % 2;
+        leak /= 2;
+
+        /* the read bucket is smaller than half the quantity to leak so take
+         * care adding the leak difference to write leak
+         */
+        if (ios[BLOCK_IO_LIMIT_READ] <= leak) {
+            read_leak = ios[BLOCK_IO_LIMIT_READ];
+            write_leak = 2 * leak + remain - ios[BLOCK_IO_LIMIT_READ];
+        /* symetric case */
+        } else if (ios[BLOCK_IO_LIMIT_WRITE] <= leak) {
+            write_leak = ios[BLOCK_IO_LIMIT_WRITE];
+            read_leak = 2 * leak + remain - ios[BLOCK_IO_LIMIT_WRITE];
+        /* both bucket above leak count use half the total leak for both */
+        } else {
+            write_leak = leak;
+            read_leak = leak + remain;
+        }
+    /* else we consider that limits are separated */
+    } else {
+        read_leak = (bs->io_limits.iops[BLOCK_IO_LIMIT_READ] * delta) /
+                    NANOSECONDS_PER_SECOND;
+        write_leak = (bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] * delta) /
+                     NANOSECONDS_PER_SECOND;
+    }
+
+    /* make the buckets leak */
+    ios[BLOCK_IO_LIMIT_READ]  = MAX(ios[BLOCK_IO_LIMIT_READ] - read_leak, 0);
+    ios[BLOCK_IO_LIMIT_WRITE] = MAX(ios[BLOCK_IO_LIMIT_WRITE] - write_leak, 0);
+}
+
+static void bdrv_leak_if_needed(BlockDriverState *bs)
+{
+    int64_t now;
+    int64_t delta;
+
+    if (!bs->must_leak) {
+        return;
+    }
+
+    bs->must_leak = false;
+
+    now = qemu_get_clock_ns(rt_clock);
+    delta = now - bs->previous_leak;
+    bs->previous_leak = now;
+
+    bdrv_make_bps_buckets_leak(bs, delta);
+    bdrv_make_iops_buckets_leak(bs, delta);
+}
+
+/* This callback is the timer in charge of making the leaky buckets leak */
 static void bdrv_block_timer(void *opaque)
 {
     BlockDriverState *bs = opaque;
 
+    /* rearm the timer */
+    qemu_mod_timer(bs->block_timer,
+                   qemu_get_clock_ns(vm_clock) +
+                   BLOCK_IO_THROTTLE_PERIOD);
+
+    bs->must_leak = true;
     qemu_co_enter_next(&bs->throttled_reqs);
 }
 
@@ -152,6 +254,10 @@  void bdrv_io_limits_enable(BlockDriverState *bs)
     qemu_co_queue_init(&bs->throttled_reqs);
     bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
     bs->io_limits_enabled = true;
+    bs->previous_leak = qemu_get_clock_ns(rt_clock);
+    qemu_mod_timer(bs->block_timer,
+                   qemu_get_clock_ns(vm_clock) +
+                   BLOCK_IO_THROTTLE_PERIOD);
 }
 
 bool bdrv_io_limits_enabled(BlockDriverState *bs)
@@ -165,15 +271,113 @@  bool bdrv_io_limits_enabled(BlockDriverState *bs)
          || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
 }
 
+/* This function checks if the applicable bandwidth threshold has been exceeded
+ *
+ * @is_write: true if the current IO is a write, false if it's a read
+ * @ret:      true if the threshold has been exceeded, else false
+ */
+static bool bdrv_is_bps_threshold_exceeded(BlockDriverState *bs, bool is_write)
+{
+    /* limit is on total read + write bps : do the sum and compare with total
+     * threshold
+     */
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        int64_t bytes = bs->leaky_buckets.bytes[0] +
+                        bs->leaky_buckets.bytes[1];
+        return bs->io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] < bytes;
+    }
+
+    /* check wether the threshold corresponding to the current io type (read,
+     * write has been exceeded
+     */
+    if (bs->io_limits.bps[is_write]) {
+        return bs->io_limits.bps_threshold[is_write] <
+               bs->leaky_buckets.bytes[is_write];
+    }
+
+    /* no limit */
+    return false;
+}
+
+/* This function checks if the applicable iops threshold has been exceeded
+ *
+ * @is_write: true if the current IO is a write, false if it's a read
+ * @ret:      true if the threshold has been exceeded, else false
+ */
+static bool bdrv_is_iops_threshold_exceeded(BlockDriverState *bs, bool is_write)
+{
+    /* limit is on total read + write iops : do the sum and compare with total
+     * threshold
+     */
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        double ios = bs->leaky_buckets.ios[0] +
+                     bs->leaky_buckets.ios[1];
+        return bs->io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] < ios;
+    }
+
+    /* check wether the threshold corresponding to the current io type (read,
+     * write has been exceeded
+     */
+    if (bs->io_limits.iops[is_write]) {
+        return bs->io_limits.iops_threshold[is_write] <
+               bs->leaky_buckets.ios[is_write];
+    }
+
+    /* no limit */
+    return false;
+}
+
+/* This function checks if any bandwidth or iops threshold has been exceeded
+ *
+ * @nb_sectors: the number of sectors of the current IO
+ * @is_write:   true if the current IO is a write, false if it's a read
+ * @ret:        true if any threshold has been exceeded, else false
+ */
+static bool bdrv_is_any_threshold_exceeded(BlockDriverState *bs, int nb_sectors,
+                                           bool is_write)
+{
+    bool bps_ret, iops_ret;
+
+    /* check if any bandwith or per IO threshold has been exceeded */
+    bps_ret = bdrv_is_bps_threshold_exceeded(bs, is_write);
+    iops_ret = bdrv_is_iops_threshold_exceeded(bs, is_write);
+
+    /* if so the IO will be blocked so do not account it and return true
+     * also return false if a bdrv_drain_all is in progress
+     */
+    if (!draining && (bps_ret || iops_ret)) {
+        return true;
+    }
+
+    /* NOTE: the counter can go above the threshold when authorizing an IO.
+     *       At the next call the code will punish the guest by blocking the
+     *       next IO until the counter has been decremented below the threshold.
+     *       This way if a guest issues a jumbo IO bigger than the threshold it
+     *       will have a chance to be authorized and will not result in a guest
+     *       IO deadlock.
+     */
+
+    /* the IO is authorized so do the accounting and return false */
+    bs->leaky_buckets.bytes[is_write] += (int64_t)nb_sectors *
+                                         BDRV_SECTOR_SIZE;
+    bs->leaky_buckets.ios[is_write]++;
+
+    return false;
+}
+
 static void bdrv_io_limits_intercept(BlockDriverState *bs,
                                      bool is_write, int nb_sectors)
 {
-    int64_t wait_time = -1;
-
+    bdrv_leak_if_needed(bs);
+    /* if some IOs are already queued because the bucket is full put the current
+     * IO at the end of the queue (FIFO)
+     */
     if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
         qemu_co_queue_wait(&bs->throttled_reqs);
     }
 
+    bdrv_leak_if_needed(bs);
+
     /* In fact, we hope to keep each request's timing, in FIFO mode. The next
      * throttled requests will not be dequeued until the current request is
      * allowed to be serviced. So if the current request still exceeds the
@@ -181,13 +385,19 @@  static void bdrv_io_limits_intercept(BlockDriverState *bs,
      * be still in throttled_reqs queue.
      */
 
-    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
-        qemu_mod_timer(bs->block_timer,
-                       wait_time + qemu_get_clock_ns(vm_clock));
+    /* if a threshold is exceeded the leaky bucket is full so the code put the
+     * IO in the throttle_reqs queue until the bucket has leaked enough to be
+     * not full
+     */
+    while (bdrv_is_any_threshold_exceeded(bs, nb_sectors, is_write)) {
+        bdrv_leak_if_needed(bs);
         qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+        bdrv_leak_if_needed(bs);
     }
 
+    bdrv_leak_if_needed(bs);
     qemu_co_queue_next(&bs->throttled_reqs);
+    bdrv_leak_if_needed(bs);
 }
 
 /* check if the path starts with "<protocol>:" */
@@ -1439,6 +1649,9 @@  void bdrv_drain_all(void)
     BlockDriverState *bs;
     bool busy;
 
+    /* tell the throttling code we are draining */
+    draining = true;
+
     do {
         busy = qemu_aio_wait();
 
@@ -1457,6 +1670,8 @@  void bdrv_drain_all(void)
         assert(QLIST_EMPTY(&bs->tracked_requests));
         assert(qemu_co_queue_empty(&bs->throttled_reqs));
     }
+
+    draining = false;
 }
 
 /* make a BlockDriverState anonymous by removing from bdrv_state list.
@@ -1492,9 +1707,7 @@  static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     bs_dest->enable_write_cache = bs_src->enable_write_cache;
 
     /* i/o timing parameters */
-    bs_dest->slice_start        = bs_src->slice_start;
-    bs_dest->slice_end          = bs_src->slice_end;
-    bs_dest->slice_submitted    = bs_src->slice_submitted;
+    bs_dest->leaky_buckets      = bs_src->leaky_buckets;
     bs_dest->io_limits          = bs_src->io_limits;
     bs_dest->throttled_reqs     = bs_src->throttled_reqs;
     bs_dest->block_timer        = bs_src->block_timer;
@@ -3551,169 +3764,6 @@  void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->aiocb_info->cancel(acb);
 }
 
-/* block I/O throttling */
-static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
-                 bool is_write, double elapsed_time, uint64_t *wait)
-{
-    uint64_t bps_limit = 0;
-    uint64_t extension;
-    double   bytes_limit, bytes_base, bytes_res;
-    double   slice_time, wait_time;
-
-    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
-        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
-    } else if (bs->io_limits.bps[is_write]) {
-        bps_limit = bs->io_limits.bps[is_write];
-    } else {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    slice_time = bs->slice_end - bs->slice_start;
-    slice_time /= (NANOSECONDS_PER_SECOND);
-    bytes_limit = bps_limit * slice_time;
-    bytes_base  = bs->slice_submitted.bytes[is_write];
-    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
-        bytes_base += bs->slice_submitted.bytes[!is_write];
-    }
-
-    /* bytes_base: the bytes of data which have been read/written; and
-     *             it is obtained from the history statistic info.
-     * bytes_res: the remaining bytes of data which need to be read/written.
-     * (bytes_base + bytes_res) / bps_limit: used to calcuate
-     *             the total time for completing reading/writting all data.
-     */
-    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-
-    if (bytes_base + bytes_res <= bytes_limit) {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    /* Calc approx time to dispatch */
-    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
-
-    /* When the I/O rate at runtime exceeds the limits,
-     * bs->slice_end need to be extended in order that the current statistic
-     * info can be kept until the timer fire, so it is increased and tuned
-     * based on the result of experiment.
-     */
-    extension = wait_time * NANOSECONDS_PER_SECOND;
-    extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
-                BLOCK_IO_SLICE_TIME;
-    bs->slice_end += extension;
-    if (wait) {
-        *wait = wait_time * NANOSECONDS_PER_SECOND;
-    }
-
-    return true;
-}
-
-static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
-                             double elapsed_time, uint64_t *wait)
-{
-    uint64_t iops_limit = 0;
-    double   ios_limit, ios_base;
-    double   slice_time, wait_time;
-
-    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
-        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
-    } else if (bs->io_limits.iops[is_write]) {
-        iops_limit = bs->io_limits.iops[is_write];
-    } else {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    slice_time = bs->slice_end - bs->slice_start;
-    slice_time /= (NANOSECONDS_PER_SECOND);
-    ios_limit  = iops_limit * slice_time;
-    ios_base   = bs->slice_submitted.ios[is_write];
-    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
-        ios_base += bs->slice_submitted.ios[!is_write];
-    }
-
-    if (ios_base + 1 <= ios_limit) {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    /* Calc approx time to dispatch, in seconds */
-    wait_time = (ios_base + 1) / iops_limit;
-    if (wait_time > elapsed_time) {
-        wait_time = wait_time - elapsed_time;
-    } else {
-        wait_time = 0;
-    }
-
-    /* Exceeded current slice, extend it by another slice time */
-    bs->slice_end += BLOCK_IO_SLICE_TIME;
-    if (wait) {
-        *wait = wait_time * NANOSECONDS_PER_SECOND;
-    }
-
-    return true;
-}
-
-static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
-                           bool is_write, int64_t *wait)
-{
-    int64_t  now, max_wait;
-    uint64_t bps_wait = 0, iops_wait = 0;
-    double   elapsed_time;
-    int      bps_ret, iops_ret;
-
-    now = qemu_get_clock_ns(vm_clock);
-    if (now > bs->slice_end) {
-        bs->slice_start = now;
-        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
-        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
-    }
-
-    elapsed_time  = now - bs->slice_start;
-    elapsed_time  /= (NANOSECONDS_PER_SECOND);
-
-    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
-                                      is_write, elapsed_time, &bps_wait);
-    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
-                                      elapsed_time, &iops_wait);
-    if (bps_ret || iops_ret) {
-        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
-        if (wait) {
-            *wait = max_wait;
-        }
-
-        now = qemu_get_clock_ns(vm_clock);
-        if (bs->slice_end < now + max_wait) {
-            bs->slice_end = now + max_wait;
-        }
-
-        return true;
-    }
-
-    if (wait) {
-        *wait = 0;
-    }
-
-    bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
-                                           BDRV_SECTOR_SIZE;
-    bs->slice_submitted.ios[is_write]++;
-
-    return false;
-}
-
 /**************************************************************/
 /* async block device emulation */
 
diff --git a/blockdev.c b/blockdev.c
index c5abd65..a78fba4 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -280,10 +280,25 @@  static int parse_block_error_action(const char *buf, bool is_read)
     }
 }
 
+static bool invalid(int64_t limit)
+{
+    if (!limit) {
+        return false;
+    }
+
+    if (limit < (THROTTLE_HZ * 2)) {
+        return true;
+    }
+
+    return false;
+}
+
 static bool do_check_io_limits(BlockIOLimit *io_limits, Error **errp)
 {
     bool bps_flag;
     bool iops_flag;
+    bool bps_threshold_flag;
+    bool iops_threshold_flag;
 
     assert(io_limits);
 
@@ -299,13 +314,30 @@  static bool do_check_io_limits(BlockIOLimit *io_limits, Error **errp)
         return false;
     }
 
-    if (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] < 0 ||
-        io_limits->bps[BLOCK_IO_LIMIT_WRITE] < 0 ||
-        io_limits->bps[BLOCK_IO_LIMIT_READ] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_TOTAL] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_WRITE] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_READ] < 0) {
-        error_setg(errp, "bps and iops values must be 0 or greater");
+    bps_threshold_flag  =
+        (io_limits->bps_threshold[BLOCK_IO_LIMIT_TOTAL] != 0)
+         && ((io_limits->bps_threshold[BLOCK_IO_LIMIT_READ] != 0)
+         || (io_limits->bps_threshold[BLOCK_IO_LIMIT_WRITE] != 0));
+    iops_threshold_flag =
+        (io_limits->iops_threshold[BLOCK_IO_LIMIT_TOTAL] != 0)
+         && ((io_limits->iops_threshold[BLOCK_IO_LIMIT_READ] != 0)
+         || (io_limits->iops_threshold[BLOCK_IO_LIMIT_WRITE] != 0));
+    if (bps_threshold_flag || iops_threshold_flag) {
+        error_setg(errp, "bps_threshold(iops_threshold) and "
+            "bps_rd_threshold/bps_wr_threshold"
+            "(iops_rd_threshold/iops_wr_threshold) "
+            "cannot be used at the same time");
+        return false;
+    }
+
+    if (invalid(io_limits->bps[BLOCK_IO_LIMIT_TOTAL]) ||
+        invalid(io_limits->bps[BLOCK_IO_LIMIT_WRITE]) ||
+        invalid(io_limits->bps[BLOCK_IO_LIMIT_READ]) ||
+        invalid(io_limits->iops[BLOCK_IO_LIMIT_TOTAL]) ||
+        invalid(io_limits->iops[BLOCK_IO_LIMIT_WRITE]) ||
+        invalid(io_limits->iops[BLOCK_IO_LIMIT_READ])) {
+        error_setg(errp, "bps and iops values must be %i or greater",
+                   THROTTLE_HZ * 2);
         return false;
     }
 
@@ -497,6 +529,18 @@  DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type)
                            qemu_opt_get_number(opts, "iops_rd", 0);
     io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
                            qemu_opt_get_number(opts, "iops_wr", 0);
+    io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] =
+                           io_limits.bps[BLOCK_IO_LIMIT_TOTAL] / THROTTLE_HZ;
+    io_limits.bps_threshold[BLOCK_IO_LIMIT_READ]  =
+                           io_limits.bps[BLOCK_IO_LIMIT_READ] / THROTTLE_HZ;
+    io_limits.bps_threshold[BLOCK_IO_LIMIT_WRITE] =
+                           io_limits.bps[BLOCK_IO_LIMIT_WRITE] / THROTTLE_HZ;
+    io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] =
+                           io_limits.iops[BLOCK_IO_LIMIT_TOTAL] / THROTTLE_HZ;
+    io_limits.iops_threshold[BLOCK_IO_LIMIT_READ]  =
+                           io_limits.iops[BLOCK_IO_LIMIT_READ] / THROTTLE_HZ;
+    io_limits.iops_threshold[BLOCK_IO_LIMIT_WRITE] =
+                           io_limits.iops[BLOCK_IO_LIMIT_WRITE] / THROTTLE_HZ;
 
     if (!do_check_io_limits(&io_limits, &error)) {
         error_report("%s", error_get_pretty(error));
@@ -1198,6 +1242,12 @@  void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
     io_limits.iops[BLOCK_IO_LIMIT_TOTAL]= iops;
     io_limits.iops[BLOCK_IO_LIMIT_READ] = iops_rd;
     io_limits.iops[BLOCK_IO_LIMIT_WRITE]= iops_wr;
+    io_limits.bps_threshold[BLOCK_IO_LIMIT_TOTAL] = bps / THROTTLE_HZ;
+    io_limits.bps_threshold[BLOCK_IO_LIMIT_READ]  = bps_rd / THROTTLE_HZ;
+    io_limits.bps_threshold[BLOCK_IO_LIMIT_WRITE] = bps_wr / THROTTLE_HZ;
+    io_limits.iops_threshold[BLOCK_IO_LIMIT_TOTAL] = iops / THROTTLE_HZ;
+    io_limits.iops_threshold[BLOCK_IO_LIMIT_READ]  = iops_rd / THROTTLE_HZ;
+    io_limits.iops_threshold[BLOCK_IO_LIMIT_WRITE] = iops_wr / THROTTLE_HZ;
 
     if (!do_check_io_limits(&io_limits, errp)) {
         return;
@@ -1209,11 +1259,10 @@  void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
         bdrv_io_limits_enable(bs);
     } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {
         bdrv_io_limits_disable(bs);
-    } else {
-        if (bs->block_timer) {
-            qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock));
-        }
     }
+
+    /* reset leaky bucket to get the system in a known state */
+    memset(&bs->leaky_buckets, 0, sizeof(bs->leaky_buckets));
 }
 
 int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
diff --git a/include/block/block_int.h b/include/block/block_int.h
index c6ac871..e32ad1f 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -43,8 +43,9 @@ 
 #define BLOCK_IO_LIMIT_WRITE    1
 #define BLOCK_IO_LIMIT_TOTAL    2
 
-#define BLOCK_IO_SLICE_TIME     100000000
 #define NANOSECONDS_PER_SECOND  1000000000.0
+#define THROTTLE_HZ 1
+#define BLOCK_IO_THROTTLE_PERIOD (NANOSECONDS_PER_SECOND / THROTTLE_HZ)
 
 #define BLOCK_OPT_SIZE              "size"
 #define BLOCK_OPT_ENCRYPT           "encryption"
@@ -73,11 +74,13 @@  typedef struct BdrvTrackedRequest {
 typedef struct BlockIOLimit {
     int64_t bps[3];
     int64_t iops[3];
+    int64_t bps_threshold[3];
+    int64_t iops_threshold[3];
 } BlockIOLimit;
 
 typedef struct BlockIOBaseValue {
-    uint64_t bytes[2];
-    uint64_t ios[2];
+    int64_t bytes[2];
+    double  ios[2];
 } BlockIOBaseValue;
 
 struct BlockDriver {
@@ -264,10 +267,10 @@  struct BlockDriverState {
     unsigned int copy_on_read_in_flight;
 
     /* the time for latest disk I/O */
-    int64_t slice_start;
-    int64_t slice_end;
     BlockIOLimit io_limits;
-    BlockIOBaseValue slice_submitted;
+    BlockIOBaseValue leaky_buckets;
+    int64_t      previous_leak;
+    bool         must_leak;
     CoQueue      throttled_reqs;
     QEMUTimer    *block_timer;
     bool         io_limits_enabled;