Patchwork [v12,3/5] block: add I/O throttling algorithm

login
register
mail settings
Submitter Zhi Yong Wu
Date Nov. 3, 2011, 8:57 a.m.
Message ID <1320310649-7639-4-git-send-email-wuzhy@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/123417/
State New
Headers show

Comments

Zhi Yong Wu - Nov. 3, 2011, 8:57 a.m.
Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 block.c     |  220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block.h     |    1 +
 block_int.h |    1 +
 3 files changed, 222 insertions(+), 0 deletions(-)
Kevin Wolf - Nov. 7, 2011, 3:18 p.m.
Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  block.c     |  220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block.h     |    1 +
>  block_int.h |    1 +
>  3 files changed, 222 insertions(+), 0 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 79e7f09..b2af48f 100644
> --- a/block.c
> +++ b/block.c
> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>                                                 bool is_write);
>  static void coroutine_fn bdrv_co_do_rw(void *opaque);
>  
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +        double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, int64_t *wait);
> +
>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>  
> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>  #endif
>  
>  /* throttling disk I/O limits */
> +void bdrv_io_limits_disable(BlockDriverState *bs)
> +{
> +    bs->io_limits_enabled = false;
> +
> +    while (qemu_co_queue_next(&bs->throttled_reqs));
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);
> +        bs->block_timer = NULL;
> +    }
> +
> +    bs->slice_start = 0;
> +    bs->slice_end   = 0;
> +    bs->slice_time  = 0;
> +    memset(&bs->io_base, 0, sizeof(bs->io_base));
> +}
> +
>  static void bdrv_block_timer(void *opaque)
>  {
>      BlockDriverState *bs = opaque;
> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>  }
>  
> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
> +                                     bool is_write, int nb_sectors)
> +{
> +    int64_t wait_time = -1;
> +
> +    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
> +        qemu_co_queue_wait(&bs->throttled_reqs);
> +    }
> +
> +    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
> +     * throttled requests will not be dequeued until the current request is
> +     * allowed to be serviced. So if the current request still exceeds the
> +     * limits, it will be inserted to the head. All requests followed it will
> +     * be still in throttled_reqs queue.
> +     */
> +
> +    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
> +        qemu_mod_timer(bs->block_timer,
> +                       wait_time + qemu_get_clock_ns(vm_clock));
> +        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
> +    }
> +
> +    qemu_co_queue_next(&bs->throttled_reqs);
> +}
> +
>  /* check if the path starts with "<protocol>:" */
>  static int path_has_protocol(const char *path)
>  {
> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>          bdrv_dev_change_media_cb(bs, true);
>      }
>  
> +    /* throttling disk I/O limits */
> +    if (bs->io_limits_enabled) {
> +        bdrv_io_limits_enable(bs);
> +    }
> +
>      return 0;
>  
>  unlink_and_fail:
> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>  
>          bdrv_dev_change_media_cb(bs, false);
>      }
> +
> +    /*throttling disk I/O limits*/
> +    if (bs->io_limits_enabled) {
> +        bdrv_io_limits_disable(bs);
> +    }
>  }
>  
>  void bdrv_close_all(void)
> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>          return -EIO;
>      }
>  
> +    /* throttling disk read I/O */
> +    if (bs->io_limits_enabled) {
> +        bdrv_io_limits_intercept(bs, false, nb_sectors);
> +    }
> +
>      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>  }
>  
> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>          return -EIO;
>      }
>  
> +    /* throttling disk write I/O */
> +    if (bs->io_limits_enabled) {
> +        bdrv_io_limits_intercept(bs, true, nb_sectors);
> +    }
> +
>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>  
>      if (bs->dirty_bitmap) {
> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>      acb->pool->cancel(acb);
>  }
>  
> +/* block I/O throttling */
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +                 bool is_write, double elapsed_time, uint64_t *wait) {
> +    uint64_t bps_limit = 0;
> +    double   bytes_limit, bytes_base, bytes_res;
> +    double   slice_time, wait_time;
> +
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.bps[is_write]) {
> +        bps_limit = bs->io_limits.bps[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    slice_time = bs->slice_end - bs->slice_start;
> +    slice_time /= (NANOSECONDS_PER_SECOND);
> +    bytes_limit = bps_limit * slice_time;
> +    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
> +    }
> +
> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +
> +    if (bytes_base + bytes_res <= bytes_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
> +
> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    }

I'm not quite sure what bs->slice_end really is and what these
calculations do exactly. Looks like magic. Can you add some comments
that explain why slice_end is increased and how you estimate *wait?

> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +                             double elapsed_time, uint64_t *wait) {

Coding style requires the brace on its own line.

> +    uint64_t iops_limit = 0;
> +    double   ios_limit, ios_base;
> +    double   slice_time, wait_time;
> +
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.iops[is_write]) {
> +        iops_limit = bs->io_limits.iops[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    slice_time = bs->slice_end - bs->slice_start;
> +    slice_time /= (NANOSECONDS_PER_SECOND);
> +    ios_limit  = iops_limit * slice_time;
> +    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
> +    }
> +
> +    if (ios_base + 1 <= ios_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (ios_base + 1) / iops_limit;
> +    if (wait_time > elapsed_time) {
> +        wait_time = wait_time - elapsed_time;
> +    } else {
> +        wait_time = 0;
> +    }
> +
> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    }
> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +                           bool is_write, int64_t *wait) {

Same here.

Kevin
Zhiyong Wu - Nov. 8, 2011, 4:34 a.m.
On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
>> ---
>>  block.c     |  220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>  block.h     |    1 +
>>  block_int.h |    1 +
>>  3 files changed, 222 insertions(+), 0 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index 79e7f09..b2af48f 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>                                                 bool is_write);
>>  static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +        double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, int64_t *wait);
>> +
>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>  #endif
>>
>>  /* throttling disk I/O limits */
>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>> +{
>> +    bs->io_limits_enabled = false;
>> +
>> +    while (qemu_co_queue_next(&bs->throttled_reqs));
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +        bs->block_timer = NULL;
>> +    }
>> +
>> +    bs->slice_start = 0;
>> +    bs->slice_end   = 0;
>> +    bs->slice_time  = 0;
>> +    memset(&bs->io_base, 0, sizeof(bs->io_base));
>> +}
>> +
>>  static void bdrv_block_timer(void *opaque)
>>  {
>>      BlockDriverState *bs = opaque;
>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>  }
>>
>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>> +                                     bool is_write, int nb_sectors)
>> +{
>> +    int64_t wait_time = -1;
>> +
>> +    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>> +        qemu_co_queue_wait(&bs->throttled_reqs);
>> +    }
>> +
>> +    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>> +     * throttled requests will not be dequeued until the current request is
>> +     * allowed to be serviced. So if the current request still exceeds the
>> +     * limits, it will be inserted to the head. All requests followed it will
>> +     * be still in throttled_reqs queue.
>> +     */
>> +
>> +    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>> +        qemu_mod_timer(bs->block_timer,
>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>> +        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>> +    }
>> +
>> +    qemu_co_queue_next(&bs->throttled_reqs);
>> +}
>> +
>>  /* check if the path starts with "<protocol>:" */
>>  static int path_has_protocol(const char *path)
>>  {
>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>          bdrv_dev_change_media_cb(bs, true);
>>      }
>>
>> +    /* throttling disk I/O limits */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_enable(bs);
>> +    }
>> +
>>      return 0;
>>
>>  unlink_and_fail:
>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>
>>          bdrv_dev_change_media_cb(bs, false);
>>      }
>> +
>> +    /*throttling disk I/O limits*/
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_disable(bs);
>> +    }
>>  }
>>
>>  void bdrv_close_all(void)
>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>>          return -EIO;
>>      }
>>
>> +    /* throttling disk read I/O */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_intercept(bs, false, nb_sectors);
>> +    }
>> +
>>      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>  }
>>
>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>>          return -EIO;
>>      }
>>
>> +    /* throttling disk write I/O */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_intercept(bs, true, nb_sectors);
>> +    }
>> +
>>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>
>>      if (bs->dirty_bitmap) {
>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>      acb->pool->cancel(acb);
>>  }
>>
>> +/* block I/O throttling */
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>> +    uint64_t bps_limit = 0;
>> +    double   bytes_limit, bytes_base, bytes_res;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.bps[is_write]) {
>> +        bps_limit = bs->io_limits.bps[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end - bs->slice_start;
>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>> +    bytes_limit = bps_limit * slice_time;
>> +    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>> +    }
>> +
>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    if (bytes_base + bytes_res <= bytes_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>> +
>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>
> I'm not quire sure what bs->slice_end really is and what these
> calculations do exactly. Looks like magic. Can you add some comments
> that explain why slice_end is increased?
As you know, when the I/O rate at runtime exceeds the limits,
bs->slice_end needs to be extended so that the current statistics
are kept until the timer fires, so it is increased; the amount was tuned
based on experimental results.

> and how you estimate *wait?
The wait time is calculated based on the historical bps and iops statistics.

bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;

1.) bytes_base is the number of bytes which have already been read/written;
it is obtained from the historical statistics.
2.) bytes_res is the remaining number of bytes which need to be read/written.
3.) (bytes_base + bytes_res) / bps_limit: this expression is used
to calculate the total time needed to finish reading/writing all the data.

I'm not sure whether this explanation is clear to you.

>
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +                             double elapsed_time, uint64_t *wait) {
>
> Coding style requires the brace on its own line.
>
>> +    uint64_t iops_limit = 0;
>> +    double   ios_limit, ios_base;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.iops[is_write]) {
>> +        iops_limit = bs->io_limits.iops[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end - bs->slice_start;
>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>> +    ios_limit  = iops_limit * slice_time;
>> +    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
>> +    }
>> +
>> +    if (ios_base + 1 <= ios_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (ios_base + 1) / iops_limit;
>> +    if (wait_time > elapsed_time) {
>> +        wait_time = wait_time - elapsed_time;
>> +    } else {
>> +        wait_time = 0;
>> +    }
>> +
>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +                           bool is_write, int64_t *wait) {
>
> Same here.
>
> Kevin
>
Kevin Wolf - Nov. 8, 2011, 8:41 a.m.
Am 08.11.2011 05:34, schrieb Zhi Yong Wu:
> On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
>>> ---
>>>  block.c     |  220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  block.h     |    1 +
>>>  block_int.h |    1 +
>>>  3 files changed, 222 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/block.c b/block.c
>>> index 79e7f09..b2af48f 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>>                                                 bool is_write);
>>>  static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>>
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>> +        double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, int64_t *wait);
>>> +
>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>
>>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>>  #endif
>>>
>>>  /* throttling disk I/O limits */
>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>> +{
>>> +    bs->io_limits_enabled = false;
>>> +
>>> +    while (qemu_co_queue_next(&bs->throttled_reqs));
>>> +
>>> +    if (bs->block_timer) {
>>> +        qemu_del_timer(bs->block_timer);
>>> +        qemu_free_timer(bs->block_timer);
>>> +        bs->block_timer = NULL;
>>> +    }
>>> +
>>> +    bs->slice_start = 0;
>>> +    bs->slice_end   = 0;
>>> +    bs->slice_time  = 0;
>>> +    memset(&bs->io_base, 0, sizeof(bs->io_base));
>>> +}
>>> +
>>>  static void bdrv_block_timer(void *opaque)
>>>  {
>>>      BlockDriverState *bs = opaque;
>>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>>  }
>>>
>>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>> +                                     bool is_write, int nb_sectors)
>>> +{
>>> +    int64_t wait_time = -1;
>>> +
>>> +    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>>> +        qemu_co_queue_wait(&bs->throttled_reqs);
>>> +    }
>>> +
>>> +    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>>> +     * throttled requests will not be dequeued until the current request is
>>> +     * allowed to be serviced. So if the current request still exceeds the
>>> +     * limits, it will be inserted to the head. All requests followed it will
>>> +     * be still in throttled_reqs queue.
>>> +     */
>>> +
>>> +    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>>> +        qemu_mod_timer(bs->block_timer,
>>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>>> +        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>>> +    }
>>> +
>>> +    qemu_co_queue_next(&bs->throttled_reqs);
>>> +}
>>> +
>>>  /* check if the path starts with "<protocol>:" */
>>>  static int path_has_protocol(const char *path)
>>>  {
>>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>          bdrv_dev_change_media_cb(bs, true);
>>>      }
>>>
>>> +    /* throttling disk I/O limits */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_enable(bs);
>>> +    }
>>> +
>>>      return 0;
>>>
>>>  unlink_and_fail:
>>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>>
>>>          bdrv_dev_change_media_cb(bs, false);
>>>      }
>>> +
>>> +    /*throttling disk I/O limits*/
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_disable(bs);
>>> +    }
>>>  }
>>>
>>>  void bdrv_close_all(void)
>>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>>>          return -EIO;
>>>      }
>>>
>>> +    /* throttling disk read I/O */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_intercept(bs, false, nb_sectors);
>>> +    }
>>> +
>>>      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>>  }
>>>
>>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>>>          return -EIO;
>>>      }
>>>
>>> +    /* throttling disk write I/O */
>>> +    if (bs->io_limits_enabled) {
>>> +        bdrv_io_limits_intercept(bs, true, nb_sectors);
>>> +    }
>>> +
>>>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>>
>>>      if (bs->dirty_bitmap) {
>>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>>      acb->pool->cancel(acb);
>>>  }
>>>
>>> +/* block I/O throttling */
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>>> +    uint64_t bps_limit = 0;
>>> +    double   bytes_limit, bytes_base, bytes_res;
>>> +    double   slice_time, wait_time;
>>> +
>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>>> +    } else if (bs->io_limits.bps[is_write]) {
>>> +        bps_limit = bs->io_limits.bps[is_write];
>>> +    } else {
>>> +        if (wait) {
>>> +            *wait = 0;
>>> +        }
>>> +
>>> +        return false;
>>> +    }
>>> +
>>> +    slice_time = bs->slice_end - bs->slice_start;
>>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>>> +    bytes_limit = bps_limit * slice_time;
>>> +    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>> +        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>>> +    }
>>> +
>>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>> +
>>> +    if (bytes_base + bytes_res <= bytes_limit) {
>>> +        if (wait) {
>>> +            *wait = 0;
>>> +        }
>>> +
>>> +        return false;
>>> +    }
>>> +
>>> +    /* Calc approx time to dispatch */
>>> +    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>> +
>>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>>> +    if (wait) {
>>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>> +    }
>>
>> I'm not quire sure what bs->slice_end really is and what these
>> calculations do exactly. Looks like magic. Can you add some comments
>> that explain why slice_end is increased?
> As you'ver known, when the I/O rate at runtime exceeds the limits,
> bs->slice_end need to be extended in order that the current statistic
> info can be kept until the timer fire, so it is increased and tuned
> based on the result of experimet.
> 
>> and how you estimate *wait?
> The wait time is calcuated based on the history info of bps and iops.
> 
> bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
> 
> 1.) bytes_base is the bytes of data which have been read/written; and
> it is obtained from the history statistic info.
> 2.) bytes_res is the remaining bytes of data which need to be read/written.
> 3.) (bytes_base + bytes_res) / bps_limit, this expression will be used
> to calcuated the total time for completing reading/writting all data.
> 
> I don't make sure if you understand this.

Yes, I think this makes sense to me.

However, I don't understand why things like 10 * BLOCK_IO_SLICE_TIME or
3 * BLOCK_IO_SLICE_TIME appear in the code. These numbers are magic for
me. Are they more or less arbitrary values that happen to work well?

Kevin
Zhiyong Wu - Nov. 8, 2011, 8:57 a.m.
On Tue, Nov 8, 2011 at 4:41 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 08.11.2011 05:34, schrieb Zhi Yong Wu:
>> On Mon, Nov 7, 2011 at 11:18 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>>> Am 03.11.2011 09:57, schrieb Zhi Yong Wu:
>>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
>>>> ---
>>>>  block.c     |  220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>>  block.h     |    1 +
>>>>  block_int.h |    1 +
>>>>  3 files changed, 222 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/block.c b/block.c
>>>> index 79e7f09..b2af48f 100644
>>>> --- a/block.c
>>>> +++ b/block.c
>>>> @@ -74,6 +74,13 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
>>>>                                                 bool is_write);
>>>>  static void coroutine_fn bdrv_co_do_rw(void *opaque);
>>>>
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>> +        double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, int64_t *wait);
>>>> +
>>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>
>>>> @@ -107,6 +114,24 @@ int is_windows_drive(const char *filename)
>>>>  #endif
>>>>
>>>>  /* throttling disk I/O limits */
>>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>>> +{
>>>> +    bs->io_limits_enabled = false;
>>>> +
>>>> +    while (qemu_co_queue_next(&bs->throttled_reqs));
>>>> +
>>>> +    if (bs->block_timer) {
>>>> +        qemu_del_timer(bs->block_timer);
>>>> +        qemu_free_timer(bs->block_timer);
>>>> +        bs->block_timer = NULL;
>>>> +    }
>>>> +
>>>> +    bs->slice_start = 0;
>>>> +    bs->slice_end   = 0;
>>>> +    bs->slice_time  = 0;
>>>> +    memset(&bs->io_base, 0, sizeof(bs->io_base));
>>>> +}
>>>> +
>>>>  static void bdrv_block_timer(void *opaque)
>>>>  {
>>>>      BlockDriverState *bs = opaque;
>>>> @@ -136,6 +161,31 @@ bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>>>           || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
>>>>  }
>>>>
>>>> +static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>>> +                                     bool is_write, int nb_sectors)
>>>> +{
>>>> +    int64_t wait_time = -1;
>>>> +
>>>> +    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
>>>> +        qemu_co_queue_wait(&bs->throttled_reqs);
>>>> +    }
>>>> +
>>>> +    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
>>>> +     * throttled requests will not be dequeued until the current request is
>>>> +     * allowed to be serviced. So if the current request still exceeds the
>>>> +     * limits, it will be inserted to the head. All requests followed it will
>>>> +     * be still in throttled_reqs queue.
>>>> +     */
>>>> +
>>>> +    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
>>>> +        qemu_mod_timer(bs->block_timer,
>>>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>>>> +        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
>>>> +    }
>>>> +
>>>> +    qemu_co_queue_next(&bs->throttled_reqs);
>>>> +}
>>>> +
>>>>  /* check if the path starts with "<protocol>:" */
>>>>  static int path_has_protocol(const char *path)
>>>>  {
>>>> @@ -718,6 +768,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>>          bdrv_dev_change_media_cb(bs, true);
>>>>      }
>>>>
>>>> +    /* throttling disk I/O limits */
>>>> +    if (bs->io_limits_enabled) {
>>>> +        bdrv_io_limits_enable(bs);
>>>> +    }
>>>> +
>>>>      return 0;
>>>>
>>>>  unlink_and_fail:
>>>> @@ -753,6 +808,11 @@ void bdrv_close(BlockDriverState *bs)
>>>>
>>>>          bdrv_dev_change_media_cb(bs, false);
>>>>      }
>>>> +
>>>> +    /*throttling disk I/O limits*/
>>>> +    if (bs->io_limits_enabled) {
>>>> +        bdrv_io_limits_disable(bs);
>>>> +    }
>>>>  }
>>>>
>>>>  void bdrv_close_all(void)
>>>> @@ -1291,6 +1351,11 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>>>>          return -EIO;
>>>>      }
>>>>
>>>> +    /* throttling disk read I/O */
>>>> +    if (bs->io_limits_enabled) {
>>>> +        bdrv_io_limits_intercept(bs, false, nb_sectors);
>>>> +    }
>>>> +
>>>>      return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
>>>>  }
>>>>
>>>> @@ -1321,6 +1386,11 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>>>>          return -EIO;
>>>>      }
>>>>
>>>> +    /* throttling disk write I/O */
>>>> +    if (bs->io_limits_enabled) {
>>>> +        bdrv_io_limits_intercept(bs, true, nb_sectors);
>>>> +    }
>>>> +
>>>>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>>>>
>>>>      if (bs->dirty_bitmap) {
>>>> @@ -2512,6 +2582,156 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
>>>>      acb->pool->cancel(acb);
>>>>  }
>>>>
>>>> +/* block I/O throttling */
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>>>> +    uint64_t bps_limit = 0;
>>>> +    double   bytes_limit, bytes_base, bytes_res;
>>>> +    double   slice_time, wait_time;
>>>> +
>>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>>>> +    } else if (bs->io_limits.bps[is_write]) {
>>>> +        bps_limit = bs->io_limits.bps[is_write];
>>>> +    } else {
>>>> +        if (wait) {
>>>> +            *wait = 0;
>>>> +        }
>>>> +
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    slice_time = bs->slice_end - bs->slice_start;
>>>> +    slice_time /= (NANOSECONDS_PER_SECOND);
>>>> +    bytes_limit = bps_limit * slice_time;
>>>> +    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
>>>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>>>> +        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
>>>> +    }
>>>> +
>>>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>>> +
>>>> +    if (bytes_base + bytes_res <= bytes_limit) {
>>>> +        if (wait) {
>>>> +            *wait = 0;
>>>> +        }
>>>> +
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    /* Calc approx time to dispatch */
>>>> +    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>>> +
>>>> +    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>>> +    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
>>>> +    if (wait) {
>>>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>>>> +    }
>>>
>>> I'm not quire sure what bs->slice_end really is and what these
>>> calculations do exactly. Looks like magic. Can you add some comments
>>> that explain why slice_end is increased?
>> As you'ver known, when the I/O rate at runtime exceeds the limits,
>> bs->slice_end need to be extended in order that the current statistic
>> info can be kept until the timer fire, so it is increased and tuned
>> based on the result of experimet.
>>
>>> and how you estimate *wait?
>> The wait time is calcuated based on the history info of bps and iops.
>>
>> bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
>>
>> 1.) bytes_base is the bytes of data which have been read/written; and
>> it is obtained from the history statistic info.
>> 2.) bytes_res is the remaining bytes of data which need to be read/written.
>> 3.) (bytes_base + bytes_res) / bps_limit, this expression will be used
>> to calcuated the total time for completing reading/writting all data.
>>
>> I don't make sure if you understand this.
>
> Yes, I think this makes sense to me.
>
> However, I don't understand why things like 10 * BLOCK_IO_SLICE_TIME or
10 * BLOCK_IO_SLICE_TIME is used to convert a value in seconds to
nanoseconds, and is actually 1s.
> 3 * BLOCK_IO_SLICE_TIME appear in the code. These numbers are magic for
> me. Are they more or less arbitrary values that happen to work well?
They are used to define the window size of one slice. The slice
determines how close the calculated runtime rate is to the real runtime
rate, so they are tunable values.

>
> Kevin
>

Patch

diff --git a/block.c b/block.c
index 79e7f09..b2af48f 100644
--- a/block.c
+++ b/block.c
@@ -74,6 +74,13 @@  static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
                                                bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+        double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, int64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -107,6 +114,24 @@  int is_windows_drive(const char *filename)
 #endif
 
 /* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+    bs->io_limits_enabled = false;
+
+    while (qemu_co_queue_next(&bs->throttled_reqs));
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer = NULL;
+    }
+
+    bs->slice_start = 0;
+    bs->slice_end   = 0;
+    bs->slice_time  = 0;
+    memset(&bs->io_base, 0, sizeof(bs->io_base));
+}
+
 static void bdrv_block_timer(void *opaque)
 {
     BlockDriverState *bs = opaque;
@@ -136,6 +161,31 @@  bool bdrv_io_limits_enabled(BlockDriverState *bs)
          || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
 }
 
+static void bdrv_io_limits_intercept(BlockDriverState *bs,
+                                     bool is_write, int nb_sectors)
+{
+    int64_t wait_time = -1;
+
+    if (!qemu_co_queue_empty(&bs->throttled_reqs)) {
+        qemu_co_queue_wait(&bs->throttled_reqs);
+    }
+
+    /* In fact, we hope to keep each request's timing, in FIFO mode. The next
+     * throttled requests will not be dequeued until the current request is
+     * allowed to be serviced. So if the current request still exceeds the
+     * limits, it will be inserted to the head. All requests followed it will
+     * be still in throttled_reqs queue.
+     */
+
+    while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) {
+        qemu_mod_timer(bs->block_timer,
+                       wait_time + qemu_get_clock_ns(vm_clock));
+        qemu_co_queue_wait_insert_head(&bs->throttled_reqs);
+    }
+
+    qemu_co_queue_next(&bs->throttled_reqs);
+}
+
 /* check if the path starts with "<protocol>:" */
 static int path_has_protocol(const char *path)
 {
@@ -718,6 +768,11 @@  int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
         bdrv_dev_change_media_cb(bs, true);
     }
 
+    /* throttling disk I/O limits */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_enable(bs);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -753,6 +808,11 @@  void bdrv_close(BlockDriverState *bs)
 
         bdrv_dev_change_media_cb(bs, false);
     }
+
+    /*throttling disk I/O limits*/
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_disable(bs);
+    }
 }
 
 void bdrv_close_all(void)
@@ -1291,6 +1351,11 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         return -EIO;
     }
 
+    /* throttling disk read I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, false, nb_sectors);
+    }
+
     return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 }
 
@@ -1321,6 +1386,11 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         return -EIO;
     }
 
+    /* throttling disk write I/O */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_intercept(bs, true, nb_sectors);
+    }
+
     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 
     if (bs->dirty_bitmap) {
@@ -2512,6 +2582,156 @@  void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->pool->cancel(acb);
 }
 
+/* block I/O throttling */
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+                 bool is_write, double elapsed_time, uint64_t *wait) {
+    uint64_t bps_limit = 0;
+    double   bytes_limit, bytes_base, bytes_res;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    bytes_limit = bps_limit * slice_time;
+    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
+    }
+
+    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_base + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time;
+
+    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+                             double elapsed_time, uint64_t *wait) {
+    uint64_t iops_limit = 0;
+    double   ios_limit, ios_base;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    ios_limit  = iops_limit * slice_time;
+    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
+    }
+
+    if (ios_base + 1 <= ios_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_base + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+        wait_time = wait_time - elapsed_time;
+    } else {
+        wait_time = 0;
+    }
+
+    bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    bs->slice_end += bs->slice_time - 3 * BLOCK_IO_SLICE_TIME;
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+                           bool is_write, int64_t *wait) {
+    int64_t  now, max_wait;
+    uint64_t bps_wait = 0, iops_wait = 0;
+    double   elapsed_time;
+    int      bps_ret, iops_ret;
+
+    now = qemu_get_clock_ns(vm_clock);
+    if ((bs->slice_start < now)
+        && (bs->slice_end > now)) {
+        bs->slice_end = now + bs->slice_time;
+    } else {
+        bs->slice_time  =  5 * BLOCK_IO_SLICE_TIME;
+        bs->slice_start = now;
+        bs->slice_end   = now + bs->slice_time;
+
+        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
+        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
+
+        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
+        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
+    }
+
+    elapsed_time  = now - bs->slice_start;
+    elapsed_time  /= (NANOSECONDS_PER_SECOND);
+
+    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                      is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                      elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        now = qemu_get_clock_ns(vm_clock);
+        if (bs->slice_end < now + max_wait) {
+            bs->slice_end = now + max_wait;
+        }
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
 
 /**************************************************************/
 /* async block device emulation */
diff --git a/block.h b/block.h
index bc8315d..9b5b35f 100644
--- a/block.h
+++ b/block.h
@@ -91,6 +91,7 @@  void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
 /* disk I/O throttling */
 void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
 bool bdrv_io_limits_enabled(BlockDriverState *bs);
 
 void bdrv_init(void);
diff --git a/block_int.h b/block_int.h
index 7315e0d..69418fe 100644
--- a/block_int.h
+++ b/block_int.h
@@ -39,6 +39,7 @@ 
 #define BLOCK_IO_LIMIT_TOTAL    2
 
 #define BLOCK_IO_SLICE_TIME     100000000
+#define NANOSECONDS_PER_SECOND  1000000000.0
 
 #define BLOCK_OPT_SIZE          "size"
 #define BLOCK_OPT_ENCRYPT       "encryption"