diff mbox

[V4,3/5] block: Enable the new throttling code in the block layer.

Message ID 1375972194-3090-4-git-send-email-benoit@irqsave.net
State New
Headers show

Commit Message

Benoît Canet Aug. 8, 2013, 2:29 p.m. UTC
tip: Do not ever use the cfg scheduler in the guest with this code.
     It gives incorrect throttling.

Signed-off-by: Benoit Canet <benoit@irqsave.net>
---
 block.c                   |  351 ++++++++++++++-------------------------------
 block/qapi.c              |   21 ++-
 blockdev.c                |  100 +++++++------
 include/block/block.h     |    1 -
 include/block/block_int.h |   32 +----
 5 files changed, 173 insertions(+), 332 deletions(-)

Comments

Paolo Bonzini Aug. 8, 2013, 3:30 p.m. UTC | #1
Il 08/08/2013 16:29, Benoît Canet ha scritto:
> tip: Do not ever use the cfg scheduler in the guest with this code.
>      It gives incorrect throttling.

This is not really accurate; the cfq scheduler reorders reads and writes
to have longer bursts, and these sometimes exceed the rate you set.  I
understood this is mostly for separate rd/wr settings, or did you
reproduce it with "total" rates too?

Also, it would be better to have a workaround for this.  Perhaps we
could simply make the default value of max nonzero?  In the old
throttling code the slice time is 0.1s, so perhaps we could see what
happens with max=0.1*avg.

Paolo
Benoît Canet Aug. 8, 2013, 3:43 p.m. UTC | #2
> This is not really accurate; the cfq scheduler reorders reads and writes
> to have longer bursts, and these sometimes exceed the rate you set.  I
> understood this is mostly for separate rd/wr settings, or did you
> reproduce it with "total" rates too?

I'll drop the comment.

It's only with read/write settings and not with total.

> 
> Also, it would be better to have a workaround for this.  Perhaps we
> could simply make the default value of max nonzero?  In the old
> throttling code the slice time is 0.1s, so perhaps we could see what
> happens with max=0.1*avg.

I already set max to 0.1*avg if needed in the code.
Benoît Canet Aug. 8, 2013, 3:56 p.m. UTC | #3
> Also, it would be better to have a workaround for this.  Perhaps we
> could simply make the default value of max nonzero?  In the old
> throttling code the slice time is 0.1s, so perhaps we could see what
> happens with max=0.1*avg.

This gives correct results with cfg with some little auto compensated oscillations.

I will definitelly drop the bogus comment.

Best regards

Benoît
diff mbox

Patch

diff --git a/block.c b/block.c
index 01b66d8..92f0aa1 100644
--- a/block.c
+++ b/block.c
@@ -86,13 +86,6 @@  static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors);
 
-static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
-        bool is_write, double elapsed_time, uint64_t *wait);
-static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
-        double elapsed_time, uint64_t *wait);
-static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
-        bool is_write, int64_t *wait);
-
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -123,70 +116,110 @@  int is_windows_drive(const char *filename)
 #endif
 
 /* throttling disk I/O limits */
-void bdrv_io_limits_disable(BlockDriverState *bs)
+void bdrv_set_io_limits(BlockDriverState *bs,
+                        ThrottleConfig *cfg)
 {
-    bs->io_limits_enabled = false;
+    int i;
+
+    throttle_config(&bs->throttle_state, cfg);
+
+    for (i = 0; i < 2; i++) {
+        qemu_co_enter_next(&bs->throttled_reqs[i]);
+    }
+}
+
+/* this function drain all the throttled IOs */
+static bool bdrv_drain_throttled(BlockDriverState *bs)
+{
+    bool drained = false;
+    bool enabled = bs->io_limits_enabled;
+    int i;
 
-    do {} while (qemu_co_enter_next(&bs->throttled_reqs));
+    bs->io_limits_enabled = false;
 
-    if (bs->block_timer) {
-        qemu_del_timer(bs->block_timer);
-        qemu_free_timer(bs->block_timer);
-        bs->block_timer = NULL;
+    for (i = 0; i < 2; i++) {
+        while (qemu_co_enter_next(&bs->throttled_reqs[i])) {
+            drained = true;
+        }
     }
 
-    bs->slice_start = 0;
-    bs->slice_end   = 0;
+    bs->io_limits_enabled = enabled;
+
+    return drained;
 }
 
-static void bdrv_block_timer(void *opaque)
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+    bs->io_limits_enabled = false;
+
+    bdrv_drain_throttled(bs);
+
+    throttle_destroy(&bs->throttle_state);
+}
+
+static void bdrv_throttle_read_timer_cb(void *opaque)
 {
     BlockDriverState *bs = opaque;
+    qemu_co_enter_next(&bs->throttled_reqs[0]);
+}
 
-    qemu_co_enter_next(&bs->throttled_reqs);
+static void bdrv_throttle_write_timer_cb(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    qemu_co_enter_next(&bs->throttled_reqs[1]);
 }
 
+/* should be called before bdrv_set_io_limits if a limit is set */
 void bdrv_io_limits_enable(BlockDriverState *bs)
 {
-    qemu_co_queue_init(&bs->throttled_reqs);
-    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+    throttle_init(&bs->throttle_state,
+                  vm_clock,
+                  bdrv_throttle_read_timer_cb,
+                  bdrv_throttle_write_timer_cb,
+                  bs);
+    qemu_co_queue_init(&bs->throttled_reqs[0]);
+    qemu_co_queue_init(&bs->throttled_reqs[1]);
     bs->io_limits_enabled = true;
 }
 
-bool bdrv_io_limits_enabled(BlockDriverState *bs)
+/* This function make an IO wait if needed
+ *
+ * @is_write:   is the IO a write
+ * @nb_sectors: the number of sectors of the IO
+ */
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+                                     bool is_write,
+                                     int nb_sectors)
 {
-    BlockIOLimit *io_limits = &bs->io_limits;
-    return io_limits->bps[BLOCK_IO_LIMIT_READ]
-         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
-         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
-         || io_limits->iops[BLOCK_IO_LIMIT_READ]
-         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
-         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
+    /* does this io must wait */
+    bool must_wait = !throttle_allowed(&bs->throttle_state, is_write);
+
+    /* if must wait or any request of this type throttled queue the IO */
+    if (must_wait ||
+        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
+        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+    }
+
+    /* the IO will be executed do the accounting */
+    throttle_account(&bs->throttle_state, is_write, nb_sectors);
 }
 
-static void bdrv_io_limits_intercept(BlockDriverState *bs,
-                                     bool is_write, int nb_sectors)
+/* This function will schedule the next IO throttled IO of this type if needed
+ *
+ * @is_write: is the IO a write
+ */
+static void bdrv_io_limits_resched(BlockDriverState *bs, bool is_write)
 {
-    int64_t wait_time = -1;
 
-    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
-        qemu_co_queue_wait(&bs->throttled_reqs);
+    if (qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
+        return;
     }
 
-    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
-     * throttled requests will not be dequeued until the current request is
-     * allowed to be serviced. So if the current request still exceeds the
-     * limits, it will be inserted to the head. All requests followed it will
-     * be still in throttled_reqs queue.
-     */
-
-    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
-        qemu_mod_timer(bs->block_timer,
-                       wait_time + qemu_get_clock_ns(vm_clock));
-        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+    if (!throttle_allowed(&bs->throttle_state, is_write)) {
+        return;
     }
 
-    qemu_co_queue_next(&bs->throttled_reqs);
+    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
 }
 
 /* check if the path starts with "<protocol>:" */
@@ -1112,11 +1145,6 @@  int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options,
         bdrv_dev_change_media_cb(bs, true);
     }
 
-    /* throttling disk I/O limits */
-    if (bs->io_limits_enabled) {
-        bdrv_io_limits_enable(bs);
-    }
-
     return 0;
 
 unlink_and_fail:
@@ -1452,16 +1480,15 @@  void bdrv_drain_all(void)
          * a busy wait.
          */
         QTAILQ_FOREACH(bs, &bdrv_states, list) {
-            while (qemu_co_enter_next(&bs->throttled_reqs)) {
-                busy = true;
-            }
+            busy = bdrv_drain_throttled(bs);
         }
     } while (busy);
 
     /* If requests are still pending there is a bug somewhere */
     QTAILQ_FOREACH(bs, &bdrv_states, list) {
         assert(QLIST_EMPTY(&bs->tracked_requests));
-        assert(qemu_co_queue_empty(&bs->throttled_reqs));
+        assert(qemu_co_queue_empty(&bs->throttled_reqs[0]));
+        assert(qemu_co_queue_empty(&bs->throttled_reqs[1]));
     }
 }
 
@@ -1497,13 +1524,12 @@  static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
 
     bs_dest->enable_write_cache = bs_src->enable_write_cache;
 
-    /* i/o timing parameters */
-    bs_dest->slice_start        = bs_src->slice_start;
-    bs_dest->slice_end          = bs_src->slice_end;
-    bs_dest->slice_submitted    = bs_src->slice_submitted;
-    bs_dest->io_limits          = bs_src->io_limits;
-    bs_dest->throttled_reqs     = bs_src->throttled_reqs;
-    bs_dest->block_timer        = bs_src->block_timer;
+    /* i/o throttled req */
+    memcpy(&bs_dest->throttle_state,
+           &bs_src->throttle_state,
+           sizeof(ThrottleState));
+    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
+    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
 
     /* r/w error */
@@ -1550,7 +1576,7 @@  void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
     assert(bs_new->dev == NULL);
     assert(bs_new->in_use == 0);
     assert(bs_new->io_limits_enabled == false);
-    assert(bs_new->block_timer == NULL);
+    assert(!throttle_have_timer(&bs_new->throttle_state));
 
     tmp = *bs_new;
     *bs_new = *bs_old;
@@ -1569,7 +1595,7 @@  void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old)
     assert(bs_new->job == NULL);
     assert(bs_new->in_use == 0);
     assert(bs_new->io_limits_enabled == false);
-    assert(bs_new->block_timer == NULL);
+    assert(!throttle_have_timer(&bs_new->throttle_state));
 
     bdrv_rebind(bs_new);
     bdrv_rebind(bs_old);
@@ -1860,8 +1886,15 @@  int bdrv_commit_all(void)
  *
  * This function should be called when a tracked request is completing.
  */
-static void tracked_request_end(BdrvTrackedRequest *req)
+static void tracked_request_end(BlockDriverState *bs,
+                                BdrvTrackedRequest *req,
+                                bool is_write)
 {
+    /* throttling disk I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_resched(bs, is_write);
+    }
+
     QLIST_REMOVE(req, list);
     qemu_co_queue_restart_all(&req->wait_queue);
 }
@@ -1874,6 +1907,11 @@  static void tracked_request_begin(BdrvTrackedRequest *req,
                                   int64_t sector_num,
                                   int nb_sectors, bool is_write)
 {
+    /* throttling disk I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, is_write, nb_sectors);
+    }
+
     *req = (BdrvTrackedRequest){
         .bs = bs,
         .sector_num = sector_num,
@@ -2512,11 +2550,6 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         return -EIO;
     }
 
-    /* throttling disk read I/O */
-    if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, false, nb_sectors);
-    }
-
     if (bs->copy_on_read) {
         flags |= BDRV_REQ_COPY_ON_READ;
     }
@@ -2547,7 +2580,7 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 
 out:
-    tracked_request_end(&req);
+    tracked_request_end(bs, &req, false);
 
     if (flags & BDRV_REQ_COPY_ON_READ) {
         bs->copy_on_read_in_flight--;
@@ -2625,11 +2658,6 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         return -EIO;
     }
 
-    /* throttling disk write I/O */
-    if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, true, nb_sectors);
-    }
-
     if (bs->copy_on_read_in_flight) {
         wait_for_overlapping_requests(bs, sector_num, nb_sectors);
     }
@@ -2658,7 +2686,7 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bs->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 
-    tracked_request_end(&req);
+    tracked_request_end(bs, &req, true);
 
     return ret;
 }
@@ -2751,14 +2779,6 @@  void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr)
     *nb_sectors_ptr = length;
 }
 
-/* throttling disk io limits */
-void bdrv_set_io_limits(BlockDriverState *bs,
-                        BlockIOLimit *io_limits)
-{
-    bs->io_limits = *io_limits;
-    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
-}
-
 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                        BlockdevOnError on_write_error)
 {
@@ -3568,169 +3588,6 @@  void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->aiocb_info->cancel(acb);
 }
 
-/* block I/O throttling */
-static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
-                 bool is_write, double elapsed_time, uint64_t *wait)
-{
-    uint64_t bps_limit = 0;
-    uint64_t extension;
-    double   bytes_limit, bytes_base, bytes_res;
-    double   slice_time, wait_time;
-
-    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
-        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
-    } else if (bs->io_limits.bps[is_write]) {
-        bps_limit = bs->io_limits.bps[is_write];
-    } else {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    slice_time = bs->slice_end - bs->slice_start;
-    slice_time /= (NANOSECONDS_PER_SECOND);
-    bytes_limit = bps_limit * slice_time;
-    bytes_base  = bs->slice_submitted.bytes[is_write];
-    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
-        bytes_base += bs->slice_submitted.bytes[!is_write];
-    }
-
-    /* bytes_base: the bytes of data which have been read/written; and
-     *             it is obtained from the history statistic info.
-     * bytes_res: the remaining bytes of data which need to be read/written.
-     * (bytes_base + bytes_res) / bps_limit: used to calcuate
-     *             the total time for completing reading/writting all data.
-     */
-    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-
-    if (bytes_base + bytes_res <= bytes_limit) {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    /* Calc approx time to dispatch */
-    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
-
-    /* When the I/O rate at runtime exceeds the limits,
-     * bs->slice_end need to be extended in order that the current statistic
-     * info can be kept until the timer fire, so it is increased and tuned
-     * based on the result of experiment.
-     */
-    extension = wait_time * NANOSECONDS_PER_SECOND;
-    extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) *
-                BLOCK_IO_SLICE_TIME;
-    bs->slice_end += extension;
-    if (wait) {
-        *wait = wait_time * NANOSECONDS_PER_SECOND;
-    }
-
-    return true;
-}
-
-static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
-                             double elapsed_time, uint64_t *wait)
-{
-    uint64_t iops_limit = 0;
-    double   ios_limit, ios_base;
-    double   slice_time, wait_time;
-
-    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
-        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
-    } else if (bs->io_limits.iops[is_write]) {
-        iops_limit = bs->io_limits.iops[is_write];
-    } else {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    slice_time = bs->slice_end - bs->slice_start;
-    slice_time /= (NANOSECONDS_PER_SECOND);
-    ios_limit  = iops_limit * slice_time;
-    ios_base   = bs->slice_submitted.ios[is_write];
-    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
-        ios_base += bs->slice_submitted.ios[!is_write];
-    }
-
-    if (ios_base + 1 <= ios_limit) {
-        if (wait) {
-            *wait = 0;
-        }
-
-        return false;
-    }
-
-    /* Calc approx time to dispatch, in seconds */
-    wait_time = (ios_base + 1) / iops_limit;
-    if (wait_time > elapsed_time) {
-        wait_time = wait_time - elapsed_time;
-    } else {
-        wait_time = 0;
-    }
-
-    /* Exceeded current slice, extend it by another slice time */
-    bs->slice_end += BLOCK_IO_SLICE_TIME;
-    if (wait) {
-        *wait = wait_time * NANOSECONDS_PER_SECOND;
-    }
-
-    return true;
-}
-
-static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
-                           bool is_write, int64_t *wait)
-{
-    int64_t  now, max_wait;
-    uint64_t bps_wait = 0, iops_wait = 0;
-    double   elapsed_time;
-    int      bps_ret, iops_ret;
-
-    now = qemu_get_clock_ns(vm_clock);
-    if (now > bs->slice_end) {
-        bs->slice_start = now;
-        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
-        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
-    }
-
-    elapsed_time  = now - bs->slice_start;
-    elapsed_time  /= (NANOSECONDS_PER_SECOND);
-
-    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
-                                      is_write, elapsed_time, &bps_wait);
-    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
-                                      elapsed_time, &iops_wait);
-    if (bps_ret || iops_ret) {
-        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
-        if (wait) {
-            *wait = max_wait;
-        }
-
-        now = qemu_get_clock_ns(vm_clock);
-        if (bs->slice_end < now + max_wait) {
-            bs->slice_end = now + max_wait;
-        }
-
-        return true;
-    }
-
-    if (wait) {
-        *wait = 0;
-    }
-
-    bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors *
-                                           BDRV_SECTOR_SIZE;
-    bs->slice_submitted.ios[is_write]++;
-
-    return false;
-}
-
 /**************************************************************/
 /* async block device emulation */
 
diff --git a/block/qapi.c b/block/qapi.c
index a4bc411..45f806b 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -223,18 +223,15 @@  void bdrv_query_info(BlockDriverState *bs,
         info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs);
 
         if (bs->io_limits_enabled) {
-            info->inserted->bps =
-                           bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
-            info->inserted->bps_rd =
-                           bs->io_limits.bps[BLOCK_IO_LIMIT_READ];
-            info->inserted->bps_wr =
-                           bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE];
-            info->inserted->iops =
-                           bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
-            info->inserted->iops_rd =
-                           bs->io_limits.iops[BLOCK_IO_LIMIT_READ];
-            info->inserted->iops_wr =
-                           bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE];
+            ThrottleConfig cfg;
+            throttle_get_config(&bs->throttle_state, &cfg);
+            info->inserted->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].ups;
+            info->inserted->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].ups;
+            info->inserted->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].ups;
+
+            info->inserted->iops    = cfg.buckets[THROTTLE_OPS_TOTAL].ups;
+            info->inserted->iops_rd = cfg.buckets[THROTTLE_OPS_READ].ups;
+            info->inserted->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].ups;
         }
 
         bs0 = bs;
diff --git a/blockdev.c b/blockdev.c
index 41b0a49..7559ea5 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -281,32 +281,16 @@  static int parse_block_error_action(const char *buf, bool is_read)
     }
 }
 
-static bool do_check_io_limits(BlockIOLimit *io_limits, Error **errp)
+static bool check_throttle_config(ThrottleConfig *cfg, Error **errp)
 {
-    bool bps_flag;
-    bool iops_flag;
-
-    assert(io_limits);
-
-    bps_flag  = (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] != 0)
-                 && ((io_limits->bps[BLOCK_IO_LIMIT_READ] != 0)
-                 || (io_limits->bps[BLOCK_IO_LIMIT_WRITE] != 0));
-    iops_flag = (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] != 0)
-                 && ((io_limits->iops[BLOCK_IO_LIMIT_READ] != 0)
-                 || (io_limits->iops[BLOCK_IO_LIMIT_WRITE] != 0));
-    if (bps_flag || iops_flag) {
-        error_setg(errp, "bps(iops) and bps_rd/bps_wr(iops_rd/iops_wr) "
+    if (throttle_conflicting(cfg)) {
+        error_setg(errp, "bps/iops/max total values and read/write values"
                          "cannot be used at the same time");
         return false;
     }
 
-    if (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] < 0 ||
-        io_limits->bps[BLOCK_IO_LIMIT_WRITE] < 0 ||
-        io_limits->bps[BLOCK_IO_LIMIT_READ] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_TOTAL] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_WRITE] < 0 ||
-        io_limits->iops[BLOCK_IO_LIMIT_READ] < 0) {
-        error_setg(errp, "bps and iops values must be 0 or greater");
+    if (!throttle_is_valid(cfg)) {
+        error_setg(errp, "bps/iops/maxs values must be 0 or greater");
         return false;
     }
 
@@ -331,7 +315,7 @@  static DriveInfo *blockdev_init(QemuOpts *all_opts,
     int on_read_error, on_write_error;
     const char *devaddr;
     DriveInfo *dinfo;
-    BlockIOLimit io_limits;
+    ThrottleConfig cfg;
     int snapshot = 0;
     bool copy_on_read;
     int ret;
@@ -489,20 +473,31 @@  static DriveInfo *blockdev_init(QemuOpts *all_opts,
     }
 
     /* disk I/O throttling */
-    io_limits.bps[BLOCK_IO_LIMIT_TOTAL]  =
+    cfg.buckets[THROTTLE_BPS_TOTAL].ups =
         qemu_opt_get_number(opts, "throttling.bps-total", 0);
-    io_limits.bps[BLOCK_IO_LIMIT_READ]   =
+    cfg.buckets[THROTTLE_BPS_READ].ups  =
         qemu_opt_get_number(opts, "throttling.bps-read", 0);
-    io_limits.bps[BLOCK_IO_LIMIT_WRITE]  =
+    cfg.buckets[THROTTLE_BPS_WRITE].ups =
         qemu_opt_get_number(opts, "throttling.bps-write", 0);
-    io_limits.iops[BLOCK_IO_LIMIT_TOTAL] =
+    cfg.buckets[THROTTLE_OPS_TOTAL].ups =
         qemu_opt_get_number(opts, "throttling.iops-total", 0);
-    io_limits.iops[BLOCK_IO_LIMIT_READ]  =
+    cfg.buckets[THROTTLE_OPS_READ].ups =
         qemu_opt_get_number(opts, "throttling.iops-read", 0);
-    io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
+    cfg.buckets[THROTTLE_OPS_WRITE].ups =
         qemu_opt_get_number(opts, "throttling.iops-write", 0);
 
-    if (!do_check_io_limits(&io_limits, &error)) {
+    cfg.buckets[THROTTLE_BPS_TOTAL].max = 0;
+    cfg.buckets[THROTTLE_BPS_READ].max  = 0;
+    cfg.buckets[THROTTLE_BPS_WRITE].max = 0;
+
+    cfg.buckets[THROTTLE_OPS_TOTAL].max = 0;
+    cfg.buckets[THROTTLE_OPS_READ].max  = 0;
+    cfg.buckets[THROTTLE_OPS_WRITE].max = 0;
+
+    cfg.unit_size = BDRV_SECTOR_SIZE;
+    cfg.op_size = 0;
+
+    if (!check_throttle_config(&cfg, &error)) {
         error_report("%s", error_get_pretty(error));
         error_free(error);
         return NULL;
@@ -629,7 +624,10 @@  static DriveInfo *blockdev_init(QemuOpts *all_opts,
     bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
 
     /* disk I/O throttling */
-    bdrv_set_io_limits(dinfo->bdrv, &io_limits);
+    if (throttle_enabled(&cfg)) {
+        bdrv_io_limits_enable(dinfo->bdrv);
+        bdrv_set_io_limits(dinfo->bdrv, &cfg);
+    }
 
     switch(type) {
     case IF_IDE:
@@ -1262,7 +1260,7 @@  void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
                                int64_t bps_wr, int64_t iops, int64_t iops_rd,
                                int64_t iops_wr, Error **errp)
 {
-    BlockIOLimit io_limits;
+    ThrottleConfig cfg;
     BlockDriverState *bs;
 
     bs = bdrv_find(device);
@@ -1271,27 +1269,37 @@  void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
         return;
     }
 
-    io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = bps;
-    io_limits.bps[BLOCK_IO_LIMIT_READ]  = bps_rd;
-    io_limits.bps[BLOCK_IO_LIMIT_WRITE] = bps_wr;
-    io_limits.iops[BLOCK_IO_LIMIT_TOTAL]= iops;
-    io_limits.iops[BLOCK_IO_LIMIT_READ] = iops_rd;
-    io_limits.iops[BLOCK_IO_LIMIT_WRITE]= iops_wr;
+    cfg.buckets[THROTTLE_BPS_TOTAL].ups = bps;
+    cfg.buckets[THROTTLE_BPS_READ].ups  = bps_rd;
+    cfg.buckets[THROTTLE_BPS_WRITE].ups = bps_wr;
+
+    cfg.buckets[THROTTLE_OPS_TOTAL].ups = iops;
+    cfg.buckets[THROTTLE_OPS_READ].ups  = iops_rd;
+    cfg.buckets[THROTTLE_OPS_WRITE].ups = iops_wr;
+
+    cfg.buckets[THROTTLE_BPS_TOTAL].max = 0;
+    cfg.buckets[THROTTLE_BPS_READ].max  = 0;
+    cfg.buckets[THROTTLE_BPS_WRITE].max = 0;
+
+    cfg.buckets[THROTTLE_OPS_TOTAL].max = 0;
+    cfg.buckets[THROTTLE_OPS_READ].max  = 0;
+    cfg.buckets[THROTTLE_OPS_WRITE].max = 0;
 
-    if (!do_check_io_limits(&io_limits, errp)) {
+    cfg.unit_size = BDRV_SECTOR_SIZE;
+    cfg.op_size = 0;
+
+    if (!check_throttle_config(&cfg, errp)) {
         return;
     }
 
-    bs->io_limits = io_limits;
-
-    if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) {
+    if (!bs->io_limits_enabled && throttle_enabled(&cfg)) {
         bdrv_io_limits_enable(bs);
-    } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {
+    } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) {
         bdrv_io_limits_disable(bs);
-    } else {
-        if (bs->block_timer) {
-            qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock));
-        }
+    }
+
+    if (bs->io_limits_enabled) {
+        bdrv_set_io_limits(bs, &cfg);
     }
 }
 
diff --git a/include/block/block.h b/include/block/block.h
index 742fce5..b16d579 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -107,7 +107,6 @@  void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 /* disk I/O throttling */
 void bdrv_io_limits_enable(BlockDriverState *bs);
 void bdrv_io_limits_disable(BlockDriverState *bs);
-bool bdrv_io_limits_enabled(BlockDriverState *bs);
 
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index e45f2a0..a0b6bc1 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -34,18 +34,12 @@ 
 #include "monitor/monitor.h"
 #include "qemu/hbitmap.h"
 #include "block/snapshot.h"
+#include "qemu/throttle.h"
 
 #define BLOCK_FLAG_ENCRYPT          1
 #define BLOCK_FLAG_COMPAT6          4
 #define BLOCK_FLAG_LAZY_REFCOUNTS   8
 
-#define BLOCK_IO_LIMIT_READ     0
-#define BLOCK_IO_LIMIT_WRITE    1
-#define BLOCK_IO_LIMIT_TOTAL    2
-
-#define BLOCK_IO_SLICE_TIME     100000000
-#define NANOSECONDS_PER_SECOND  1000000000.0
-
 #define BLOCK_OPT_SIZE              "size"
 #define BLOCK_OPT_ENCRYPT           "encryption"
 #define BLOCK_OPT_COMPAT6           "compat6"
@@ -69,17 +63,6 @@  typedef struct BdrvTrackedRequest {
     CoQueue wait_queue; /* coroutines blocked on this request */
 } BdrvTrackedRequest;
 
-
-typedef struct BlockIOLimit {
-    int64_t bps[3];
-    int64_t iops[3];
-} BlockIOLimit;
-
-typedef struct BlockIOBaseValue {
-    uint64_t bytes[2];
-    uint64_t ios[2];
-} BlockIOBaseValue;
-
 struct BlockDriver {
     const char *format_name;
     int instance_size;
@@ -263,13 +246,9 @@  struct BlockDriverState {
     /* number of in-flight copy-on-read requests */
     unsigned int copy_on_read_in_flight;
 
-    /* the time for latest disk I/O */
-    int64_t slice_start;
-    int64_t slice_end;
-    BlockIOLimit io_limits;
-    BlockIOBaseValue slice_submitted;
-    CoQueue      throttled_reqs;
-    QEMUTimer    *block_timer;
+    /* I/O throttling */
+    ThrottleState throttle_state;
+    CoQueue      throttled_reqs[2];
     bool         io_limits_enabled;
 
     /* I/O stats (display with "info blockstats"). */
@@ -308,7 +287,8 @@  struct BlockDriverState {
 int get_tmp_filename(char *filename, int size);
 
 void bdrv_set_io_limits(BlockDriverState *bs,
-                        BlockIOLimit *io_limits);
+                        ThrottleConfig *cfg);
+
 
 /**
  * bdrv_add_before_write_notifier: