Patchwork [v5,3/4] block: add block timer and block throttling algorithm

login
register
mail settings
Submitter Zhi Yong Wu
Date Aug. 9, 2011, 4:17 a.m.
Message ID <1312863472-6901-4-git-send-email-wuzhy@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/109126/
State New
Headers show

Comments

Zhi Yong Wu - Aug. 9, 2011, 4:17 a.m.
Note:
      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
      2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits.

For these problems, if you have nice thought, pls let us know.:)

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 block.c     |  347 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 block.h     |    6 +-
 block_int.h |   30 +++++
 3 files changed, 372 insertions(+), 11 deletions(-)
Ram Pai - Aug. 9, 2011, 8:57 a.m.
On Tue, Aug 09, 2011 at 12:17:51PM +0800, Zhi Yong Wu wrote:
> Note:
>       1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>       2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits.
> 
> For these problems, if you have nice thought, pls let us know.:)
> 
> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> ---
>  block.c     |  347 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  block.h     |    6 +-
>  block_int.h |   30 +++++
>  3 files changed, 372 insertions(+), 11 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 24a25d5..8fd6643 100644
> --- a/block.c
> +++ b/block.c
> @@ -29,6 +29,9 @@
>  #include "module.h"
>  #include "qemu-objects.h"
> 
> +#include "qemu-timer.h"
> +#include "block/blk-queue.h"
> +
>  #ifdef CONFIG_BSD
>  #include <sys/types.h>
>  #include <sys/stat.h>
> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>                           const uint8_t *buf, int nb_sectors);
> 
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +        double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, uint64_t *wait);
> +
>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
> 
> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>  }
>  #endif
> 
> +/* throttling disk I/O limits */
> +void bdrv_io_limits_disable(BlockDriverState *bs)
> +{
> +    bs->io_limits_enabled = false;
> +    bs->req_from_queue    = false;
> +
> +    if (bs->block_queue) {
> +        qemu_block_queue_flush(bs->block_queue);
> +        qemu_del_block_queue(bs->block_queue);
> +    }
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);
> +    }
> +
> +    bs->slice_start[0]   = 0;
> +    bs->slice_start[1]   = 0;
> +
> +    bs->slice_end[0]     = 0;
> +    bs->slice_end[1]     = 0;
> +}
> +
> +static void bdrv_block_timer(void *opaque)
> +{
> +    BlockDriverState *bs = opaque;
> +    BlockQueue *queue = bs->block_queue;
> +
> +    qemu_block_queue_flush(queue);
> +}
> +
> +void bdrv_io_limits_enable(BlockDriverState *bs)
> +{
> +    bs->req_from_queue = false;
> +
> +    bs->block_queue    = qemu_new_block_queue();
> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
> +
> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
 
a minor comment. better to keep the slice_start of the both the READ and WRITE
side the same.  

    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_READ];

saves  a call to qemu_get_clock_ns().

> +
> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;

    bs->slice_end[BLOCK_IO_LIMIT_READ] = bs->slice_start[BLOCK_IO_LIMIT_READ] +
			BLOCK_IO_SLICE_TIME;

saves one more call to qemu_get_clock_ns()

> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;


    bs->slice_end[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_WRITE] +
			BLOCK_IO_SLICE_TIME;

yet another call saving.


> +}
> +
> +bool bdrv_io_limits_enabled(BlockDriverState *bs)
> +{
> +    BlockIOLimit *io_limits = &bs->io_limits;
> +    if ((io_limits->bps[BLOCK_IO_LIMIT_READ] == 0)
> +         && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] == 0)
> +         && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] == 0)
> +         && (io_limits->iops[BLOCK_IO_LIMIT_READ] == 0)
> +         && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] == 0)
> +         && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] == 0)) {
> +        return false;
> +    }
> +
> +    return true;
> +}

can be optimized to:

	return (io_limits->bps[BLOCK_IO_LIMIT_READ]
		|| io_limits->bps[BLOCK_IO_LIMIT_WRITE]
		|| io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
		|| io_limits->iops[BLOCK_IO_LIMIT_READ]
		|| io_limits->iops[BLOCK_IO_LIMIT_WRITE]
		|| io_limits->iops[BLOCK_IO_LIMIT_TOTAL]);

> +
>  /* check if the path starts with "<protocol>:" */
>  static int path_has_protocol(const char *path)
>  {
> @@ -642,6 +714,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>      }
> 
> +    /* throttling disk I/O limits */
> +    if (bs->io_limits_enabled) {
> +        bdrv_io_limits_enable(bs);
> +    }
> +
>      return 0;
> 
>  unlink_and_fail:
> @@ -680,6 +757,16 @@ void bdrv_close(BlockDriverState *bs)
>          if (bs->change_cb)
>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>      }
> +
> +    /* throttling disk I/O limits */
> +    if (bs->block_queue) {
> +        qemu_del_block_queue(bs->block_queue);
> +    }
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);
> +    }
>  }
> 
>  void bdrv_close_all(void)
> @@ -1312,6 +1399,15 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
>      *psecs = bs->secs;
>  }
> 
> +/* throttling disk io limits */
> +void bdrv_set_io_limits(BlockDriverState *bs,
> +                            BlockIOLimit *io_limits)
> +{
> +    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
> +    bs->io_limits = *io_limits;
> +    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
> +}
> +
>  /* Recognize floppy formats */
>  typedef struct FDFormat {
>      FDriveType drive;
> @@ -1707,6 +1803,16 @@ static void bdrv_print_dict(QObject *obj, void *opaque)
>                              qdict_get_bool(qdict, "ro"),
>                              qdict_get_str(qdict, "drv"),
>                              qdict_get_bool(qdict, "encrypted"));
> +
> +        monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64
> +                            " bps_wr=%" PRId64 " iops=%" PRId64
> +                            " iops_rd=%" PRId64 " iops_wr=%" PRId64,
> +                            qdict_get_int(qdict, "bps"),
> +                            qdict_get_int(qdict, "bps_rd"),
> +                            qdict_get_int(qdict, "bps_wr"),
> +                            qdict_get_int(qdict, "iops"),
> +                            qdict_get_int(qdict, "iops_rd"),
> +                            qdict_get_int(qdict, "iops_wr"));
>      } else {
>          monitor_printf(mon, " [not inserted]");
>      }
> @@ -1739,10 +1845,22 @@ void bdrv_info(Monitor *mon, QObject **ret_data)
>              QDict *bs_dict = qobject_to_qdict(bs_obj);
> 
>              obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
> -                                     "'encrypted': %i }",
> +                                     "'encrypted': %i, "
> +                                     "'bps': %" PRId64 ","
> +                                     "'bps_rd': %" PRId64 ","
> +                                     "'bps_wr': %" PRId64 ","
> +                                     "'iops': %" PRId64 ","
> +                                     "'iops_rd': %" PRId64 ","
> +                                     "'iops_wr': %" PRId64 "}",
>                                       bs->filename, bs->read_only,
>                                       bs->drv->format_name,
> -                                     bdrv_is_encrypted(bs));
> +                                     bdrv_is_encrypted(bs),
> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL],
> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_READ],
> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE],
> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL],
> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_READ],
> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]);
>              if (bs->backing_file[0] != '\0') {
>                  QDict *qdict = qobject_to_qdict(obj);
>                  qdict_put(qdict, "backing_file",
> @@ -2111,6 +2229,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
>      return buf;
>  }
> 
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +                 bool is_write, double elapsed_time, uint64_t *wait) {
> +    uint64_t bps_limit = 0;
> +    double   bytes_limit, bytes_disp, bytes_res;
> +    double   slice_time, wait_time;
> +
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.bps[is_write]) {
> +        bps_limit = bs->io_limits.bps[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
> +    bytes_limit = bps_limit * slice_time;
> +    bytes_disp  = bs->io_disps.bytes[is_write];
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bytes_disp += bs->io_disps.bytes[!is_write];
> +    }
> +
> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +
> +    if (bytes_disp + bytes_res <= bytes_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
> +
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    }
> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +                             double elapsed_time, uint64_t *wait) {
> +    uint64_t iops_limit = 0;
> +    double   ios_limit, ios_disp;
> +    double   slice_time, wait_time;
> +
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.iops[is_write]) {
> +        iops_limit = bs->io_limits.iops[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
> +    ios_limit  = iops_limit * slice_time;
> +    ios_disp   = bs->io_disps.ios[is_write];
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        ios_disp += bs->io_disps.ios[!is_write];
> +    }
> +
> +    if (ios_disp + 1 <= ios_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (ios_disp + 1) / iops_limit;
> +    if (wait_time > elapsed_time) {
> +        wait_time = wait_time - elapsed_time;
> +    } else {
> +        wait_time = 0;
> +    }
> +
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    }
> +
> +    return true;
> +}

bdrv_exceed_bps_limits() and bdrv_exceed_iops_limits() has almost similar logic.
Probably can be abstracted into a single function.

RP
Zhiyong Wu - Aug. 9, 2011, 9:06 a.m.
On Tue, Aug 9, 2011 at 4:57 PM, Ram Pai <linuxram@us.ibm.com> wrote:
> On Tue, Aug 09, 2011 at 12:17:51PM +0800, Zhi Yong Wu wrote:
>> Note:
>>       1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>       2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits.
>>
>> For these problems, if you have nice thought, pls let us know.:)
>>
>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> ---
>>  block.c     |  347 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>  block.h     |    6 +-
>>  block_int.h |   30 +++++
>>  3 files changed, 372 insertions(+), 11 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index 24a25d5..8fd6643 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -29,6 +29,9 @@
>>  #include "module.h"
>>  #include "qemu-objects.h"
>>
>> +#include "qemu-timer.h"
>> +#include "block/blk-queue.h"
>> +
>>  #ifdef CONFIG_BSD
>>  #include <sys/types.h>
>>  #include <sys/stat.h>
>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>                           const uint8_t *buf, int nb_sectors);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +        double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, uint64_t *wait);
>> +
>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>>  }
>>  #endif
>>
>> +/* throttling disk I/O limits */
>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>> +{
>> +    bs->io_limits_enabled = false;
>> +    bs->req_from_queue    = false;
>> +
>> +    if (bs->block_queue) {
>> +        qemu_block_queue_flush(bs->block_queue);
>> +        qemu_del_block_queue(bs->block_queue);
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +    }
>> +
>> +    bs->slice_start[0]   = 0;
>> +    bs->slice_start[1]   = 0;
>> +
>> +    bs->slice_end[0]     = 0;
>> +    bs->slice_end[1]     = 0;
>> +}
>> +
>> +static void bdrv_block_timer(void *opaque)
>> +{
>> +    BlockDriverState *bs = opaque;
>> +    BlockQueue *queue = bs->block_queue;
>> +
>> +    qemu_block_queue_flush(queue);
>> +}
>> +
>> +void bdrv_io_limits_enable(BlockDriverState *bs)
>> +{
>> +    bs->req_from_queue = false;
>> +
>> +    bs->block_queue    = qemu_new_block_queue();
>> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>> +
>> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
>> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
>
> a minor comment. better to keep the slice_start of the both the READ and WRITE
> side the same.
>
>    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_READ];
>
> saves  a call to qemu_get_clock_ns().
>
>> +
>> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>
>    bs->slice_end[BLOCK_IO_LIMIT_READ] = bs->slice_start[BLOCK_IO_LIMIT_READ] +
>                        BLOCK_IO_SLICE_TIME;
>
> saves one more call to qemu_get_clock_ns()
>
>> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>
>
>    bs->slice_end[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_WRITE] +
>                        BLOCK_IO_SLICE_TIME;
>
> yet another call saving.
OK. thanks.
>
>
>> +}
>> +
>> +bool bdrv_io_limits_enabled(BlockDriverState *bs)
>> +{
>> +    BlockIOLimit *io_limits = &bs->io_limits;
>> +    if ((io_limits->bps[BLOCK_IO_LIMIT_READ] == 0)
>> +         && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] == 0)
>> +         && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_READ] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] == 0)) {
>> +        return false;
>> +    }
>> +
>> +    return true;
>> +}
>
> can be optimized to:
>
>        return (io_limits->bps[BLOCK_IO_LIMIT_READ]
>                || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
>                || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
>                || io_limits->iops[BLOCK_IO_LIMIT_READ]
>                || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
>                || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]);
OK. thanks.
>
>> +
>>  /* check if the path starts with "<protocol>:" */
>>  static int path_has_protocol(const char *path)
>>  {
>> @@ -642,6 +714,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>>
>> +    /* throttling disk I/O limits */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_enable(bs);
>> +    }
>> +
>>      return 0;
>>
>>  unlink_and_fail:
>> @@ -680,6 +757,16 @@ void bdrv_close(BlockDriverState *bs)
>>          if (bs->change_cb)
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>> +
>> +    /* throttling disk I/O limits */
>> +    if (bs->block_queue) {
>> +        qemu_del_block_queue(bs->block_queue);
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +    }
>>  }
>>
>>  void bdrv_close_all(void)
>> @@ -1312,6 +1399,15 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
>>      *psecs = bs->secs;
>>  }
>>
>> +/* throttling disk io limits */
>> +void bdrv_set_io_limits(BlockDriverState *bs,
>> +                            BlockIOLimit *io_limits)
>> +{
>> +    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
>> +    bs->io_limits = *io_limits;
>> +    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
>> +}
>> +
>>  /* Recognize floppy formats */
>>  typedef struct FDFormat {
>>      FDriveType drive;
>> @@ -1707,6 +1803,16 @@ static void bdrv_print_dict(QObject *obj, void *opaque)
>>                              qdict_get_bool(qdict, "ro"),
>>                              qdict_get_str(qdict, "drv"),
>>                              qdict_get_bool(qdict, "encrypted"));
>> +
>> +        monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64
>> +                            " bps_wr=%" PRId64 " iops=%" PRId64
>> +                            " iops_rd=%" PRId64 " iops_wr=%" PRId64,
>> +                            qdict_get_int(qdict, "bps"),
>> +                            qdict_get_int(qdict, "bps_rd"),
>> +                            qdict_get_int(qdict, "bps_wr"),
>> +                            qdict_get_int(qdict, "iops"),
>> +                            qdict_get_int(qdict, "iops_rd"),
>> +                            qdict_get_int(qdict, "iops_wr"));
>>      } else {
>>          monitor_printf(mon, " [not inserted]");
>>      }
>> @@ -1739,10 +1845,22 @@ void bdrv_info(Monitor *mon, QObject **ret_data)
>>              QDict *bs_dict = qobject_to_qdict(bs_obj);
>>
>>              obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
>> -                                     "'encrypted': %i }",
>> +                                     "'encrypted': %i, "
>> +                                     "'bps': %" PRId64 ","
>> +                                     "'bps_rd': %" PRId64 ","
>> +                                     "'bps_wr': %" PRId64 ","
>> +                                     "'iops': %" PRId64 ","
>> +                                     "'iops_rd': %" PRId64 ","
>> +                                     "'iops_wr': %" PRId64 "}",
>>                                       bs->filename, bs->read_only,
>>                                       bs->drv->format_name,
>> -                                     bdrv_is_encrypted(bs));
>> +                                     bdrv_is_encrypted(bs),
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL],
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_READ],
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_READ],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]);
>>              if (bs->backing_file[0] != '\0') {
>>                  QDict *qdict = qobject_to_qdict(obj);
>>                  qdict_put(qdict, "backing_file",
>> @@ -2111,6 +2229,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
>>      return buf;
>>  }
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>> +    uint64_t bps_limit = 0;
>> +    double   bytes_limit, bytes_disp, bytes_res;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.bps[is_write]) {
>> +        bps_limit = bs->io_limits.bps[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
>> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
>> +    bytes_limit = bps_limit * slice_time;
>> +    bytes_disp  = bs->io_disps.bytes[is_write];
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bytes_disp += bs->io_disps.bytes[!is_write];
>> +    }
>> +
>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    if (bytes_disp + bytes_res <= bytes_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
>> +
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +                             double elapsed_time, uint64_t *wait) {
>> +    uint64_t iops_limit = 0;
>> +    double   ios_limit, ios_disp;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.iops[is_write]) {
>> +        iops_limit = bs->io_limits.iops[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
>> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
>> +    ios_limit  = iops_limit * slice_time;
>> +    ios_disp   = bs->io_disps.ios[is_write];
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        ios_disp += bs->io_disps.ios[!is_write];
>> +    }
>> +
>> +    if (ios_disp + 1 <= ios_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (ios_disp + 1) / iops_limit;
>> +    if (wait_time > elapsed_time) {
>> +        wait_time = wait_time - elapsed_time;
>> +    } else {
>> +        wait_time = 0;
>> +    }
>> +
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>> +
>> +    return true;
>> +}
>
> bdrv_exceed_bps_limits() and bdrv_exceed_iops_limits() has almost similar logic.
> Probably can be abstracted into a single function.
I ever attempted to consolidate them, but found that it will also not
save LOC and cause the bad readability of this code; So i gave up.
>
> RP
>
>
Stefan Hajnoczi - Aug. 9, 2011, 3:19 p.m.
On Tue, Aug 9, 2011 at 5:17 AM, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> wrote:
> Note:
>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.

If an I/O request is larger than the limit itself then I think we
should let it through with a warning that the limit is too low.  This
reflects the fact that we don't split I/O requests into smaller
requests and the guest may give us 128 KB (or larger?) requests.  In
practice the lowest feasible limit is probably 256 KB or 512 KB.

> diff --git a/block.c b/block.c
> index 24a25d5..8fd6643 100644
> --- a/block.c
> +++ b/block.c
> @@ -29,6 +29,9 @@
>  #include "module.h"
>  #include "qemu-objects.h"
>
> +#include "qemu-timer.h"
> +#include "block/blk-queue.h"
> +
>  #ifdef CONFIG_BSD
>  #include <sys/types.h>
>  #include <sys/stat.h>
> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>                          const uint8_t *buf, int nb_sectors);
>
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +        double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, uint64_t *wait);
> +
>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>     QTAILQ_HEAD_INITIALIZER(bdrv_states);
>
> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>  }
>  #endif
>
> +/* throttling disk I/O limits */
> +void bdrv_io_limits_disable(BlockDriverState *bs)
> +{
> +    bs->io_limits_enabled = false;
> +    bs->req_from_queue    = false;
> +
> +    if (bs->block_queue) {
> +        qemu_block_queue_flush(bs->block_queue);
> +        qemu_del_block_queue(bs->block_queue);

When you fix the acb lifecycle in block-queue.c this will no longer be
safe.  Requests that are dispatched still have acbs that belong to
this queue.  It is simplest to keep the queue for the lifetime of the
BlockDriverState - it's just a list so it doesn't take up much memory.

> +    }
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);

To prevent double frees:

bs->block_timer = NULL;

> +    }
> +
> +    bs->slice_start[0]   = 0;
> +    bs->slice_start[1]   = 0;
> +
> +    bs->slice_end[0]     = 0;
> +    bs->slice_end[1]     = 0;
> +}
> +
> +static void bdrv_block_timer(void *opaque)
> +{
> +    BlockDriverState *bs = opaque;
> +    BlockQueue *queue = bs->block_queue;
> +
> +    qemu_block_queue_flush(queue);
> +}
> +
> +void bdrv_io_limits_enable(BlockDriverState *bs)
> +{
> +    bs->req_from_queue = false;
> +
> +    bs->block_queue    = qemu_new_block_queue();
> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
> +
> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
> +
> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;

The slice times could just be initialized to 0 here.  When
bdrv_exceed_io_limits() is called it will start a new slice.

> +}
> +
> +bool bdrv_io_limits_enabled(BlockDriverState *bs)
> +{
> +    BlockIOLimit *io_limits = &bs->io_limits;
> +    if ((io_limits->bps[BLOCK_IO_LIMIT_READ] == 0)
> +         && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] == 0)
> +         && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] == 0)
> +         && (io_limits->iops[BLOCK_IO_LIMIT_READ] == 0)
> +         && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] == 0)
> +         && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] == 0)) {
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
>  /* check if the path starts with "<protocol>:" */
>  static int path_has_protocol(const char *path)
>  {
> @@ -642,6 +714,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>     }
>
> +    /* throttling disk I/O limits */
> +    if (bs->io_limits_enabled) {
> +        bdrv_io_limits_enable(bs);
> +    }
> +
>     return 0;
>
>  unlink_and_fail:
> @@ -680,6 +757,16 @@ void bdrv_close(BlockDriverState *bs)
>         if (bs->change_cb)
>             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>     }
> +
> +    /* throttling disk I/O limits */
> +    if (bs->block_queue) {
> +        qemu_del_block_queue(bs->block_queue);
> +    }
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);
> +    }
>  }
>
>  void bdrv_close_all(void)
> @@ -1312,6 +1399,15 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
>     *psecs = bs->secs;
>  }
>
> +/* throttling disk io limits */
> +void bdrv_set_io_limits(BlockDriverState *bs,
> +                            BlockIOLimit *io_limits)
> +{
> +    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));

The assignment from *io_limits overwrites all of bs->io_limits, the
memset() can be removed.

> +    bs->io_limits = *io_limits;
> +    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
> +}
> +
>  /* Recognize floppy formats */
>  typedef struct FDFormat {
>     FDriveType drive;
> @@ -1707,6 +1803,16 @@ static void bdrv_print_dict(QObject *obj, void *opaque)
>                             qdict_get_bool(qdict, "ro"),
>                             qdict_get_str(qdict, "drv"),
>                             qdict_get_bool(qdict, "encrypted"));
> +
> +        monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64
> +                            " bps_wr=%" PRId64 " iops=%" PRId64
> +                            " iops_rd=%" PRId64 " iops_wr=%" PRId64,
> +                            qdict_get_int(qdict, "bps"),
> +                            qdict_get_int(qdict, "bps_rd"),
> +                            qdict_get_int(qdict, "bps_wr"),
> +                            qdict_get_int(qdict, "iops"),
> +                            qdict_get_int(qdict, "iops_rd"),
> +                            qdict_get_int(qdict, "iops_wr"));
>     } else {
>         monitor_printf(mon, " [not inserted]");
>     }
> @@ -1739,10 +1845,22 @@ void bdrv_info(Monitor *mon, QObject **ret_data)
>             QDict *bs_dict = qobject_to_qdict(bs_obj);
>
>             obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
> -                                     "'encrypted': %i }",
> +                                     "'encrypted': %i, "
> +                                     "'bps': %" PRId64 ","
> +                                     "'bps_rd': %" PRId64 ","
> +                                     "'bps_wr': %" PRId64 ","
> +                                     "'iops': %" PRId64 ","
> +                                     "'iops_rd': %" PRId64 ","
> +                                     "'iops_wr': %" PRId64 "}",
>                                      bs->filename, bs->read_only,
>                                      bs->drv->format_name,
> -                                     bdrv_is_encrypted(bs));
> +                                     bdrv_is_encrypted(bs),
> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL],
> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_READ],
> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE],
> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL],
> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_READ],
> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]);
>             if (bs->backing_file[0] != '\0') {
>                 QDict *qdict = qobject_to_qdict(obj);
>                 qdict_put(qdict, "backing_file",
> @@ -2111,6 +2229,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
>     return buf;
>  }
>
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +                 bool is_write, double elapsed_time, uint64_t *wait) {
> +    uint64_t bps_limit = 0;
> +    double   bytes_limit, bytes_disp, bytes_res;
> +    double   slice_time, wait_time;
> +
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.bps[is_write]) {
> +        bps_limit = bs->io_limits.bps[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
> +    bytes_limit = bps_limit * slice_time;
> +    bytes_disp  = bs->io_disps.bytes[is_write];
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bytes_disp += bs->io_disps.bytes[!is_write];
> +    }
> +
> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +
> +    if (bytes_disp + bytes_res <= bytes_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
> +
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    }
> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +                             double elapsed_time, uint64_t *wait) {
> +    uint64_t iops_limit = 0;
> +    double   ios_limit, ios_disp;
> +    double   slice_time, wait_time;
> +
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.iops[is_write]) {
> +        iops_limit = bs->io_limits.iops[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
> +    ios_limit  = iops_limit * slice_time;
> +    ios_disp   = bs->io_disps.ios[is_write];
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        ios_disp += bs->io_disps.ios[!is_write];
> +    }
> +
> +    if (ios_disp + 1 <= ios_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (ios_disp + 1) / iops_limit;
> +    if (wait_time > elapsed_time) {
> +        wait_time = wait_time - elapsed_time;
> +    } else {
> +        wait_time = 0;
> +    }
> +
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
> +    }
> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +                           bool is_write, uint64_t *wait) {
> +    int64_t  real_time, real_slice;
> +    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
> +    double   elapsed_time;
> +    int      bps_ret, iops_ret;
> +
> +    real_time = qemu_get_clock_ns(vm_clock);

Please call it 'now' instead of 'real_time'.  There is a "real-time
clock" called rt_clock and this variable name could be confusing.

> +    real_slice = bs->slice_end[is_write] - bs->slice_start[is_write];
> +    if ((bs->slice_start[is_write] < real_time)
> +        && (bs->slice_end[is_write] > real_time)) {
> +        bs->slice_end[is_write]   = real_time + BLOCK_IO_SLICE_TIME;
> +    } else {
> +        bs->slice_start[is_write]     = real_time;
> +        bs->slice_end[is_write]       = real_time + BLOCK_IO_SLICE_TIME;
> +
> +        bs->io_disps.bytes[is_write]  = 0;
> +        bs->io_disps.bytes[!is_write] = 0;
> +
> +        bs->io_disps.ios[is_write]    = 0;
> +        bs->io_disps.ios[!is_write]   = 0;
> +    }
> +
> +    /* If a limit was exceeded, immediately queue this request */
> +    if (!bs->req_from_queue
> +        && !QTAILQ_EMPTY(&bs->block_queue->requests)) {

bs->req_from_queue seems to be a way to prevent requests
unconditionally being queued during flush.  Please define a BlockQueue
interface instead, something like this:

/* If a limit was exceeded, immediately queue this request */
if (qemu_block_queue_has_pending(bs->block_queue)) {

Then req_from_queue can be hidden inside BlockQueue.  I'd rename it to
bool flushing.  qemu_block_queue_flush() will set it to true at the
start of the function and false at the end of the function.

> +        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
> +            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
> +            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +            if (wait) {
> +                *wait = 0;

No estimated wait time?  That's okay only if we stop setting the timer
whenever a request is queued - more comments about this below.

> +            }
> +
> +            return true;
> +        }
> +    }
> +
> +    elapsed_time  = real_time - bs->slice_start[is_write];
> +    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);

Why * 10.0?

> +
> +    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
> +                                      is_write, elapsed_time, &bps_wait);
> +    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
> +                                      elapsed_time, &iops_wait);
> +    if (bps_ret || iops_ret) {
> +        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
> +        if (wait) {
> +            *wait = max_wait;
> +        }
> +
> +        real_time = qemu_get_clock_ns(vm_clock);
> +        if (bs->slice_end[is_write] < real_time + max_wait) {
> +            bs->slice_end[is_write] = real_time + max_wait;
> +        }

Why is this necessary?  We have already extended the slice at the top
of the function.

> +
> +        return true;
> +    }
> +
> +    if (wait) {
> +        *wait = 0;
> +    }
> +
> +    return false;
> +}
>
>  /**************************************************************/
>  /* async I/Os */
> @@ -2121,13 +2398,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>  {
>     BlockDriver *drv = bs->drv;
>     BlockDriverAIOCB *ret;
> +    uint64_t wait_time = 0;
>
>     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>
> -    if (!drv)
> -        return NULL;
> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
> +        if (bs->io_limits_enabled) {
> +            bs->req_from_queue = false;
> +        }
>         return NULL;
> +    }
> +
> +    /* throttling disk read I/O */
> +    if (bs->io_limits_enabled) {
> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
> +                              sector_num, qiov, nb_sectors, cb, opaque);
> +            qemu_mod_timer(bs->block_timer,
> +                       wait_time + qemu_get_clock_ns(vm_clock));

If a guest keeps sending I/O requests, say every millisecond, then we
will delay dispatch forever.  In practice we hope the storage
interface (virtio-blk, LSI SCSI, etc) has a resource limit that
prevents this.  But the point remains that this delays requests that
should be dispatched for too long.  Once the timer has been set we
should not set it again.

> +            bs->req_from_queue = false;
> +            return ret;
> +        }
> +    }
>
>     ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>                               cb, opaque);
> @@ -2136,6 +2428,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>        /* Update stats even though technically transfer has not happened. */
>        bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>        bs->rd_ops ++;
> +
> +        if (bs->io_limits_enabled) {
> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
> +        }
> +    }
> +
> +    if (bs->io_limits_enabled) {
> +        bs->req_from_queue = false;
>     }
>
>     return ret;
> @@ -2184,15 +2486,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>     BlockDriver *drv = bs->drv;
>     BlockDriverAIOCB *ret;
>     BlockCompleteData *blk_cb_data;
> +    uint64_t wait_time = 0;
>
>     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>
> -    if (!drv)
> -        return NULL;
> -    if (bs->read_only)
> -        return NULL;
> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> +    if (!drv || bs->read_only
> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
> +        if (bs->io_limits_enabled) {
> +            bs->req_from_queue = false;
> +        }
> +
>         return NULL;
> +    }
>
>     if (bs->dirty_bitmap) {
>         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
> @@ -2201,6 +2506,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>         opaque = blk_cb_data;
>     }
>
> +    /* throttling disk write I/O */
> +    if (bs->io_limits_enabled) {
> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
> +                                  sector_num, qiov, nb_sectors, cb, opaque);
> +            qemu_mod_timer(bs->block_timer,
> +                                  wait_time + qemu_get_clock_ns(vm_clock));
> +            bs->req_from_queue = false;
> +            return ret;
> +        }
> +    }
> +
>     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
>                                cb, opaque);
>
> @@ -2211,6 +2528,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
>             bs->wr_highest_sector = sector_num + nb_sectors - 1;
>         }
> +
> +        if (bs->io_limits_enabled) {
> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
> +                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
> +        }
> +    }
> +
> +    if (bs->io_limits_enabled) {
> +        bs->req_from_queue = false;
>     }
>
>     return ret;
> diff --git a/block.h b/block.h
> index 859d1d9..3d02902 100644
> --- a/block.h
> +++ b/block.h
> @@ -57,6 +57,11 @@ void bdrv_info(Monitor *mon, QObject **ret_data);
>  void bdrv_stats_print(Monitor *mon, const QObject *data);
>  void bdrv_info_stats(Monitor *mon, QObject **ret_data);
>
> +/* disk I/O throttling */
> +void bdrv_io_limits_enable(BlockDriverState *bs);
> +void bdrv_io_limits_disable(BlockDriverState *bs);
> +bool bdrv_io_limits_enabled(BlockDriverState *bs);
> +
>  void bdrv_init(void);
>  void bdrv_init_with_whitelist(void);
>  BlockDriver *bdrv_find_protocol(const char *filename);
> @@ -97,7 +102,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
>     const char *backing_file, const char *backing_fmt);
>  void bdrv_register(BlockDriver *bdrv);
>
> -
>  typedef struct BdrvCheckResult {
>     int corruptions;
>     int leaks;
> diff --git a/block_int.h b/block_int.h
> index 1e265d2..a4cd458 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -27,10 +27,17 @@
>  #include "block.h"
>  #include "qemu-option.h"
>  #include "qemu-queue.h"
> +#include "block/blk-queue.h"
>
>  #define BLOCK_FLAG_ENCRYPT     1
>  #define BLOCK_FLAG_COMPAT6     4
>
> +#define BLOCK_IO_LIMIT_READ     0
> +#define BLOCK_IO_LIMIT_WRITE    1
> +#define BLOCK_IO_LIMIT_TOTAL    2
> +
> +#define BLOCK_IO_SLICE_TIME     100000000

Please add a comment indicating the units: /* nanoseconds */

> +
>  #define BLOCK_OPT_SIZE          "size"
>  #define BLOCK_OPT_ENCRYPT       "encryption"
>  #define BLOCK_OPT_COMPAT6       "compat6"
> @@ -46,6 +53,16 @@ typedef struct AIOPool {
>     BlockDriverAIOCB *free_aiocb;
>  } AIOPool;
>
> +typedef struct BlockIOLimit {
> +    uint64_t bps[3];
> +    uint64_t iops[3];
> +} BlockIOLimit;
> +
> +typedef struct BlockIODisp {
> +    uint64_t bytes[2];
> +    uint64_t ios[2];
> +} BlockIODisp;
> +
>  struct BlockDriver {
>     const char *format_name;
>     int instance_size;
> @@ -175,6 +192,16 @@ struct BlockDriverState {
>
>     void *sync_aiocb;
>
> +    /* the time for latest disk I/O */
> +    int64_t slice_start[2];
> +    int64_t slice_end[2];
> +    BlockIOLimit io_limits;
> +    BlockIODisp  io_disps;
> +    BlockQueue   *block_queue;
> +    QEMUTimer    *block_timer;
> +    bool         io_limits_enabled;
> +    bool         req_from_queue;
> +
>     /* I/O stats (display with "info blockstats"). */
>     uint64_t rd_bytes;
>     uint64_t wr_bytes;
> @@ -222,6 +249,9 @@ void qemu_aio_release(void *p);
>
>  void *qemu_blockalign(BlockDriverState *bs, size_t size);
>
> +void bdrv_set_io_limits(BlockDriverState *bs,
> +                            BlockIOLimit *io_limits);
> +
>  #ifdef _WIN32
>  int is_windows_drive(const char *filename);
>  #endif
> --
> 1.7.2.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Zhiyong Wu - Aug. 10, 2011, 6:57 a.m.
On Tue, Aug 9, 2011 at 11:19 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Tue, Aug 9, 2011 at 5:17 AM, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> wrote:
>> Note:
>>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>
> If an I/O request is larger than the limit itself then I think we
> should let it through with a warning that the limit is too low.  This
If it will print a waring, it seems to be not a nice way, the
throtting function will take no effect on this scenario.
> reflects the fact that we don't split I/O requests into smaller
> requests and the guest may give us 128 KB (or larger?) requests.  In
> practice the lowest feasible limit is probably 256 KB or 512 KB.
>
>> diff --git a/block.c b/block.c
>> index 24a25d5..8fd6643 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -29,6 +29,9 @@
>>  #include "module.h"
>>  #include "qemu-objects.h"
>>
>> +#include "qemu-timer.h"
>> +#include "block/blk-queue.h"
>> +
>>  #ifdef CONFIG_BSD
>>  #include <sys/types.h>
>>  #include <sys/stat.h>
>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>                          const uint8_t *buf, int nb_sectors);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +        double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, uint64_t *wait);
>> +
>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>     QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>>  }
>>  #endif
>>
>> +/* throttling disk I/O limits */
>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>> +{
>> +    bs->io_limits_enabled = false;
>> +    bs->req_from_queue    = false;
>> +
>> +    if (bs->block_queue) {
>> +        qemu_block_queue_flush(bs->block_queue);
>> +        qemu_del_block_queue(bs->block_queue);
>
> When you fix the acb lifecycle in block-queue.c this will no longer be
> safe.  Requests that are dispatched still have acbs that belong to
When the request is dispatched, if it is successfully serviced, our
acb will been removed.
> this queue.  It is simplest to keep the queue for the lifetime of the
> BlockDriverState - it's just a list so it doesn't take up much memory.
I more prefer to removing this queue, even though it is simple and
harmless to let it exist.

>
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>
> To prevent double frees:
>
> bs->block_timer = NULL;
Can qemu_free_timer() not ensure that it is NULL? This is not necessary.:)

>
>> +    }
>> +
>> +    bs->slice_start[0]   = 0;
>> +    bs->slice_start[1]   = 0;
>> +
>> +    bs->slice_end[0]     = 0;
>> +    bs->slice_end[1]     = 0;
>> +}
>> +
>> +static void bdrv_block_timer(void *opaque)
>> +{
>> +    BlockDriverState *bs = opaque;
>> +    BlockQueue *queue = bs->block_queue;
>> +
>> +    qemu_block_queue_flush(queue);
>> +}
>> +
>> +void bdrv_io_limits_enable(BlockDriverState *bs)
>> +{
>> +    bs->req_from_queue = false;
>> +
>> +    bs->block_queue    = qemu_new_block_queue();
>> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>> +
>> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
>> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
>> +
>> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>
> The slice times could just be initialized to 0 here.  When
> bdrv_exceed_io_limits() is called it will start a new slice.
The result should be same as current method even though they are set to ZERO.
>
>> +}
>> +
>> +bool bdrv_io_limits_enabled(BlockDriverState *bs)
>> +{
>> +    BlockIOLimit *io_limits = &bs->io_limits;
>> +    if ((io_limits->bps[BLOCK_IO_LIMIT_READ] == 0)
>> +         && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] == 0)
>> +         && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_READ] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] == 0)) {
>> +        return false;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>>  /* check if the path starts with "<protocol>:" */
>>  static int path_has_protocol(const char *path)
>>  {
>> @@ -642,6 +714,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>     }
>>
>> +    /* throttling disk I/O limits */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_enable(bs);
>> +    }
>> +
>>     return 0;
>>
>>  unlink_and_fail:
>> @@ -680,6 +757,16 @@ void bdrv_close(BlockDriverState *bs)
>>         if (bs->change_cb)
>>             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>     }
>> +
>> +    /* throttling disk I/O limits */
>> +    if (bs->block_queue) {
>> +        qemu_del_block_queue(bs->block_queue);
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +    }
>>  }
>>
>>  void bdrv_close_all(void)
>> @@ -1312,6 +1399,15 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
>>     *psecs = bs->secs;
>>  }
>>
>> +/* throttling disk io limits */
>> +void bdrv_set_io_limits(BlockDriverState *bs,
>> +                            BlockIOLimit *io_limits)
>> +{
>> +    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
>
> The assignment from *io_limits overwrites all of bs->io_limits, the
> memset() can be removed.
OK.
>
>> +    bs->io_limits = *io_limits;
>> +    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
>> +}
>> +
>>  /* Recognize floppy formats */
>>  typedef struct FDFormat {
>>     FDriveType drive;
>> @@ -1707,6 +1803,16 @@ static void bdrv_print_dict(QObject *obj, void *opaque)
>>                             qdict_get_bool(qdict, "ro"),
>>                             qdict_get_str(qdict, "drv"),
>>                             qdict_get_bool(qdict, "encrypted"));
>> +
>> +        monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64
>> +                            " bps_wr=%" PRId64 " iops=%" PRId64
>> +                            " iops_rd=%" PRId64 " iops_wr=%" PRId64,
>> +                            qdict_get_int(qdict, "bps"),
>> +                            qdict_get_int(qdict, "bps_rd"),
>> +                            qdict_get_int(qdict, "bps_wr"),
>> +                            qdict_get_int(qdict, "iops"),
>> +                            qdict_get_int(qdict, "iops_rd"),
>> +                            qdict_get_int(qdict, "iops_wr"));
>>     } else {
>>         monitor_printf(mon, " [not inserted]");
>>     }
>> @@ -1739,10 +1845,22 @@ void bdrv_info(Monitor *mon, QObject **ret_data)
>>             QDict *bs_dict = qobject_to_qdict(bs_obj);
>>
>>             obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
>> -                                     "'encrypted': %i }",
>> +                                     "'encrypted': %i, "
>> +                                     "'bps': %" PRId64 ","
>> +                                     "'bps_rd': %" PRId64 ","
>> +                                     "'bps_wr': %" PRId64 ","
>> +                                     "'iops': %" PRId64 ","
>> +                                     "'iops_rd': %" PRId64 ","
>> +                                     "'iops_wr': %" PRId64 "}",
>>                                      bs->filename, bs->read_only,
>>                                      bs->drv->format_name,
>> -                                     bdrv_is_encrypted(bs));
>> +                                     bdrv_is_encrypted(bs),
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL],
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_READ],
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_READ],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]);
>>             if (bs->backing_file[0] != '\0') {
>>                 QDict *qdict = qobject_to_qdict(obj);
>>                 qdict_put(qdict, "backing_file",
>> @@ -2111,6 +2229,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
>>     return buf;
>>  }
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>> +    uint64_t bps_limit = 0;
>> +    double   bytes_limit, bytes_disp, bytes_res;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.bps[is_write]) {
>> +        bps_limit = bs->io_limits.bps[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
>> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
>> +    bytes_limit = bps_limit * slice_time;
>> +    bytes_disp  = bs->io_disps.bytes[is_write];
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bytes_disp += bs->io_disps.bytes[!is_write];
>> +    }
>> +
>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    if (bytes_disp + bytes_res <= bytes_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
>> +
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +                             double elapsed_time, uint64_t *wait) {
>> +    uint64_t iops_limit = 0;
>> +    double   ios_limit, ios_disp;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.iops[is_write]) {
>> +        iops_limit = bs->io_limits.iops[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
>> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
>> +    ios_limit  = iops_limit * slice_time;
>> +    ios_disp   = bs->io_disps.ios[is_write];
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        ios_disp += bs->io_disps.ios[!is_write];
>> +    }
>> +
>> +    if (ios_disp + 1 <= ios_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (ios_disp + 1) / iops_limit;
>> +    if (wait_time > elapsed_time) {
>> +        wait_time = wait_time - elapsed_time;
>> +    } else {
>> +        wait_time = 0;
>> +    }
>> +
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +                           bool is_write, uint64_t *wait) {
>> +    int64_t  real_time, real_slice;
>> +    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
>> +    double   elapsed_time;
>> +    int      bps_ret, iops_ret;
>> +
>> +    real_time = qemu_get_clock_ns(vm_clock);
>
> Please call it 'now' instead of 'real_time'.  There is a "real-time
> clock" called rt_clock and this variable name could be confusing.
OK.
>
>> +    real_slice = bs->slice_end[is_write] - bs->slice_start[is_write];
>> +    if ((bs->slice_start[is_write] < real_time)
>> +        && (bs->slice_end[is_write] > real_time)) {
>> +        bs->slice_end[is_write]   = real_time + BLOCK_IO_SLICE_TIME;
>> +    } else {
>> +        bs->slice_start[is_write]     = real_time;
>> +        bs->slice_end[is_write]       = real_time + BLOCK_IO_SLICE_TIME;
>> +
>> +        bs->io_disps.bytes[is_write]  = 0;
>> +        bs->io_disps.bytes[!is_write] = 0;
>> +
>> +        bs->io_disps.ios[is_write]    = 0;
>> +        bs->io_disps.ios[!is_write]   = 0;
>> +    }
>> +
>> +    /* If a limit was exceeded, immediately queue this request */
>> +    if (!bs->req_from_queue
>> +        && !QTAILQ_EMPTY(&bs->block_queue->requests)) {
>
> bs->req_from_queue seems to be a way to prevent requests
> unconditionally being queued during flush.  Please define a BlockQueue
> interface instead, something like this:
>
> /* If a limit was exceeded, immediately queue this request */
> if (qemu_block_queue_has_pending(bs->block_queue)) {
>
> Then req_from_queue can be hidden inside BlockQueue.  I'd rename it to
> bool flushing.  qemu_block_queue_flush() will set it to true at the
> start of the function and false at the end of the function.
Good advice.
>
>> +        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
>> +            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
>> +            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +            if (wait) {
>> +                *wait = 0;
>
> No estimated wait time?  That's okay only if we stop setting the timer
> whenever a request is queued - more comments about this below.
Yeah.
>
>> +            }
>> +
>> +            return true;
>> +        }
>> +    }
>> +
>> +    elapsed_time  = real_time - bs->slice_start[is_write];
>> +    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
>
> Why * 10.0?
elapsed_time currently is in ns, and it need to be translated into a
floating value in minutes.
>
>> +
>> +    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
>> +                                      is_write, elapsed_time, &bps_wait);
>> +    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
>> +                                      elapsed_time, &iops_wait);
>> +    if (bps_ret || iops_ret) {
>> +        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
>> +        if (wait) {
>> +            *wait = max_wait;
>> +        }
>> +
>> +        real_time = qemu_get_clock_ns(vm_clock);
>> +        if (bs->slice_end[is_write] < real_time + max_wait) {
>> +            bs->slice_end[is_write] = real_time + max_wait;
>> +        }
>
> Why is this necessary?  We have already extended the slice at the top
> of the function.
Here it will adjust this end of slice time based on
bdrv_exceed_bps_limits(bdrv_exceed_iops_limits).
This will be more accurate.
>
>> +
>> +        return true;
>> +    }
>> +
>> +    if (wait) {
>> +        *wait = 0;
>> +    }
>> +
>> +    return false;
>> +}
>>
>>  /**************************************************************/
>>  /* async I/Os */
>> @@ -2121,13 +2398,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>  {
>>     BlockDriver *drv = bs->drv;
>>     BlockDriverAIOCB *ret;
>> +    uint64_t wait_time = 0;
>>
>>     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>>
>> -    if (!drv)
>> -        return NULL;
>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>> +        if (bs->io_limits_enabled) {
>> +            bs->req_from_queue = false;
>> +        }
>>         return NULL;
>> +    }
>> +
>> +    /* throttling disk read I/O */
>> +    if (bs->io_limits_enabled) {
>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
>> +                              sector_num, qiov, nb_sectors, cb, opaque);
>> +            qemu_mod_timer(bs->block_timer,
>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>
> If a guest keeps sending I/O requests, say every millisecond, then we
> will delay dispatch forever.  In practice we hope the storage
> interface (virtio-blk, LSI SCSI, etc) has a resource limit that
> prevents this.  But the point remains that this delays requests that
> should be dispatched for too long.  Once the timer has been set we
> should not set it again.
Can you elaborate this?

>
>> +            bs->req_from_queue = false;
>> +            return ret;
>> +        }
>> +    }
>>
>>     ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>>                               cb, opaque);
>> @@ -2136,6 +2428,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>        /* Update stats even though technically transfer has not happened. */
>>        bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>        bs->rd_ops ++;
>> +
>> +        if (bs->io_limits_enabled) {
>> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
>> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
>> +        }
>> +    }
>> +
>> +    if (bs->io_limits_enabled) {
>> +        bs->req_from_queue = false;
>>     }
>>
>>     return ret;
>> @@ -2184,15 +2486,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>     BlockDriver *drv = bs->drv;
>>     BlockDriverAIOCB *ret;
>>     BlockCompleteData *blk_cb_data;
>> +    uint64_t wait_time = 0;
>>
>>     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>>
>> -    if (!drv)
>> -        return NULL;
>> -    if (bs->read_only)
>> -        return NULL;
>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> +    if (!drv || bs->read_only
>> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
>> +        if (bs->io_limits_enabled) {
>> +            bs->req_from_queue = false;
>> +        }
>> +
>>         return NULL;
>> +    }
>>
>>     if (bs->dirty_bitmap) {
>>         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
>> @@ -2201,6 +2506,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>         opaque = blk_cb_data;
>>     }
>>
>> +    /* throttling disk write I/O */
>> +    if (bs->io_limits_enabled) {
>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
>> +                                  sector_num, qiov, nb_sectors, cb, opaque);
>> +            qemu_mod_timer(bs->block_timer,
>> +                                  wait_time + qemu_get_clock_ns(vm_clock));
>> +            bs->req_from_queue = false;
>> +            return ret;
>> +        }
>> +    }
>> +
>>     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
>>                                cb, opaque);
>>
>> @@ -2211,6 +2528,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
>>             bs->wr_highest_sector = sector_num + nb_sectors - 1;
>>         }
>> +
>> +        if (bs->io_limits_enabled) {
>> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
>> +                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
>> +        }
>> +    }
>> +
>> +    if (bs->io_limits_enabled) {
>> +        bs->req_from_queue = false;
>>     }
>>
>>     return ret;
>> diff --git a/block.h b/block.h
>> index 859d1d9..3d02902 100644
>> --- a/block.h
>> +++ b/block.h
>> @@ -57,6 +57,11 @@ void bdrv_info(Monitor *mon, QObject **ret_data);
>>  void bdrv_stats_print(Monitor *mon, const QObject *data);
>>  void bdrv_info_stats(Monitor *mon, QObject **ret_data);
>>
>> +/* disk I/O throttling */
>> +void bdrv_io_limits_enable(BlockDriverState *bs);
>> +void bdrv_io_limits_disable(BlockDriverState *bs);
>> +bool bdrv_io_limits_enabled(BlockDriverState *bs);
>> +
>>  void bdrv_init(void);
>>  void bdrv_init_with_whitelist(void);
>>  BlockDriver *bdrv_find_protocol(const char *filename);
>> @@ -97,7 +102,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
>>     const char *backing_file, const char *backing_fmt);
>>  void bdrv_register(BlockDriver *bdrv);
>>
>> -
>>  typedef struct BdrvCheckResult {
>>     int corruptions;
>>     int leaks;
>> diff --git a/block_int.h b/block_int.h
>> index 1e265d2..a4cd458 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -27,10 +27,17 @@
>>  #include "block.h"
>>  #include "qemu-option.h"
>>  #include "qemu-queue.h"
>> +#include "block/blk-queue.h"
>>
>>  #define BLOCK_FLAG_ENCRYPT     1
>>  #define BLOCK_FLAG_COMPAT6     4
>>
>> +#define BLOCK_IO_LIMIT_READ     0
>> +#define BLOCK_IO_LIMIT_WRITE    1
>> +#define BLOCK_IO_LIMIT_TOTAL    2
>> +
>> +#define BLOCK_IO_SLICE_TIME     100000000
>
> Please add a comment indicating the units: /* nanoseconds */
>
>> +
>>  #define BLOCK_OPT_SIZE          "size"
>>  #define BLOCK_OPT_ENCRYPT       "encryption"
>>  #define BLOCK_OPT_COMPAT6       "compat6"
>> @@ -46,6 +53,16 @@ typedef struct AIOPool {
>>     BlockDriverAIOCB *free_aiocb;
>>  } AIOPool;
>>
>> +typedef struct BlockIOLimit {
>> +    uint64_t bps[3];
>> +    uint64_t iops[3];
>> +} BlockIOLimit;
>> +
>> +typedef struct BlockIODisp {
>> +    uint64_t bytes[2];
>> +    uint64_t ios[2];
>> +} BlockIODisp;
>> +
>>  struct BlockDriver {
>>     const char *format_name;
>>     int instance_size;
>> @@ -175,6 +192,16 @@ struct BlockDriverState {
>>
>>     void *sync_aiocb;
>>
>> +    /* the time for latest disk I/O */
>> +    int64_t slice_start[2];
>> +    int64_t slice_end[2];
>> +    BlockIOLimit io_limits;
>> +    BlockIODisp  io_disps;
>> +    BlockQueue   *block_queue;
>> +    QEMUTimer    *block_timer;
>> +    bool         io_limits_enabled;
>> +    bool         req_from_queue;
>> +
>>     /* I/O stats (display with "info blockstats"). */
>>     uint64_t rd_bytes;
>>     uint64_t wr_bytes;
>> @@ -222,6 +249,9 @@ void qemu_aio_release(void *p);
>>
>>  void *qemu_blockalign(BlockDriverState *bs, size_t size);
>>
>> +void bdrv_set_io_limits(BlockDriverState *bs,
>> +                            BlockIOLimit *io_limits);
>> +
>>  #ifdef _WIN32
>>  int is_windows_drive(const char *filename);
>>  #endif
>> --
>> 1.7.2.3
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>
Stefan Hajnoczi - Aug. 10, 2011, 11 a.m.
On Wed, Aug 10, 2011 at 7:57 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Tue, Aug 9, 2011 at 11:19 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Tue, Aug 9, 2011 at 5:17 AM, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> wrote:
>>> Note:
>>>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>
>> If an I/O request is larger than the limit itself then I think we
>> should let it through with a warning that the limit is too low.  This
> If it will print a waring, it seems to be not a nice way, the
> throtting function will take no effect on this scenario.

Setting the limit below the max request size is a misconfiguration.
It's a problem that the user needs to be aware of and fix, so a
warning is appropriate.  Unfortunately the max request size is not
easy to determine from the block layer and may vary between guest
OSes, that's why we need to do something at runtime.

We cannot leave this case unhandled because it results in the guest
timing out I/O without any information about the problem - that makes
it hard to troubleshoot for the user.  Any other ideas on how to
handle this case?

>>> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>>>  }
>>>  #endif
>>>
>>> +/* throttling disk I/O limits */
>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>> +{
>>> +    bs->io_limits_enabled = false;
>>> +    bs->req_from_queue    = false;
>>> +
>>> +    if (bs->block_queue) {
>>> +        qemu_block_queue_flush(bs->block_queue);
>>> +        qemu_del_block_queue(bs->block_queue);
>>
>> When you fix the acb lifecycle in block-queue.c this will no longer be
>> safe.  Requests that are dispatched still have acbs that belong to
> When the request is dispatched, if it is successfully serviced, our
> acb will been removed.

Yes, that is how the patches work right now.  But the acb lifecycle
that I explained in response to the other patch changes this.

>> this queue.  It is simplest to keep the queue for the lifetime of the
>> BlockDriverState - it's just a list so it doesn't take up much memory.
> I more prefer to removing this queue, even though it is simple and
> harmless to let it exist.

If the existing acbs do not touch their BlockQueue after this point in
the code, then it will be safe to delete the queue since it will no
longer be referenced.  If you can guarantee this then it's fine to
keep this code.

>
>>
>>> +    }
>>> +
>>> +    if (bs->block_timer) {
>>> +        qemu_del_timer(bs->block_timer);
>>> +        qemu_free_timer(bs->block_timer);
>>
>> To prevent double frees:
>>
>> bs->block_timer = NULL;
> Can qemu_free_timer() not ensure that it is NULL? This is not necessary.:)

No, qemu_free_timer() is a function.  qemu_free_timer() cannot change
the pointer variable bs->block_timer because in C functions take
arguments by value.

After qemu_free_timer() bs->block_timer will still hold the old
address of the (freed) timer.  Then when bdrv_close() is called it
does qemu_free_timer() again because bs->block_timer is not NULL.  It
is illegal to free() a pointer twice and it can cause a crash.

>
>>
>>> +    }
>>> +
>>> +    bs->slice_start[0]   = 0;
>>> +    bs->slice_start[1]   = 0;
>>> +
>>> +    bs->slice_end[0]     = 0;
>>> +    bs->slice_end[1]     = 0;
>>> +}
>>> +
>>> +static void bdrv_block_timer(void *opaque)
>>> +{
>>> +    BlockDriverState *bs = opaque;
>>> +    BlockQueue *queue = bs->block_queue;
>>> +
>>> +    qemu_block_queue_flush(queue);
>>> +}
>>> +
>>> +void bdrv_io_limits_enable(BlockDriverState *bs)
>>> +{
>>> +    bs->req_from_queue = false;
>>> +
>>> +    bs->block_queue    = qemu_new_block_queue();
>>> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>> +
>>> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
>>> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
>>> +
>>> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
>>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>>> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
>>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>>
>> The slice times could just be initialized to 0 here.  When
>> bdrv_exceed_io_limits() is called it will start a new slice.
> The result should be same as current method even though they are set to ZERO.

Yes, the result is the same except we only create new slices in one place.

>>> +            }
>>> +
>>> +            return true;
>>> +        }
>>> +    }
>>> +
>>> +    elapsed_time  = real_time - bs->slice_start[is_write];
>>> +    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
>>
>> Why * 10.0?
> elapsed_time currently is in ns, and it need to be translated into a
> floating value in minutes.

I think you mean seconds instead of minutes.  Then the conversion has
nothing to do with BLOCK_IO_SLICE_TIME, it's just nanoseconds to
seconds.  It would be clearer to drop BLOCK_IO_SLICE_TIME from the
expression and divide by a new NANOSECONDS_PER_SECOND constant
(1000000000.0).

>>
>>> +
>>> +    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
>>> +                                      is_write, elapsed_time, &bps_wait);
>>> +    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
>>> +                                      elapsed_time, &iops_wait);
>>> +    if (bps_ret || iops_ret) {
>>> +        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
>>> +        if (wait) {
>>> +            *wait = max_wait;
>>> +        }
>>> +
>>> +        real_time = qemu_get_clock_ns(vm_clock);
>>> +        if (bs->slice_end[is_write] < real_time + max_wait) {
>>> +            bs->slice_end[is_write] = real_time + max_wait;
>>> +        }
>>
>> Why is this necessary?  We have already extended the slice at the top
>> of the function.
> Here it will adjust this end of slice time based on
> bdrv_exceed_bps_limits(bdrv_exceed_iops_limits).
> This will be more accurate.

After rereading it I realized that this extends the slice up to the
estimated dispatch time (max_wait) and not just by
BLOCK_IO_SLICE_TIME.  This code now makes sense to me: we're trying to
keep slices around while there are queued requests so that
iops/throughput history is not forgotten when queued requests are
dispatched.

>>
>>> +
>>> +        return true;
>>> +    }
>>> +
>>> +    if (wait) {
>>> +        *wait = 0;
>>> +    }
>>> +
>>> +    return false;
>>> +}
>>>
>>>  /**************************************************************/
>>>  /* async I/Os */
>>> @@ -2121,13 +2398,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>>  {
>>>     BlockDriver *drv = bs->drv;
>>>     BlockDriverAIOCB *ret;
>>> +    uint64_t wait_time = 0;
>>>
>>>     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>>>
>>> -    if (!drv)
>>> -        return NULL;
>>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>>> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>>> +        if (bs->io_limits_enabled) {
>>> +            bs->req_from_queue = false;
>>> +        }
>>>         return NULL;
>>> +    }
>>> +
>>> +    /* throttling disk read I/O */
>>> +    if (bs->io_limits_enabled) {
>>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
>>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
>>> +                              sector_num, qiov, nb_sectors, cb, opaque);
>>> +            qemu_mod_timer(bs->block_timer,
>>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>>
>> If a guest keeps sending I/O requests, say every millisecond, then we
>> will delay dispatch forever.  In practice we hope the storage
>> interface (virtio-blk, LSI SCSI, etc) has a resource limit that
>> prevents this.  But the point remains that this delays requests that
>> should be dispatched for too long.  Once the timer has been set we
>> should not set it again.
> Can you elaborate this?

Let's wait until the next version of the patch, I've suggested many
changes and should comment against current code instead of the way I
think it is going to work :).

Stefan
Zhiyong Wu - Aug. 12, 2011, 5 a.m.
On Wed, Aug 10, 2011 at 7:00 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Wed, Aug 10, 2011 at 7:57 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Tue, Aug 9, 2011 at 11:19 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>> On Tue, Aug 9, 2011 at 5:17 AM, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> wrote:
>>>> Note:
>>>>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>>
>>> If an I/O request is larger than the limit itself then I think we
>>> should let it through with a warning that the limit is too low.  This
>> If it will print a waring, it seems to be not a nice way, the
>> throtting function will take no effect on this scenario.
>
> Setting the limit below the max request size is a misconfiguration.
> It's a problem that the user needs to be aware of and fix, so a
> warning is appropriate.  Unfortunately the max request size is not
> easy to determine from the block layer and may vary between guest
> OSes, that's why we need to do something at runtime.
>
> We cannot leave this case unhandled because it results in the guest
> timing out I/O without any information about the problem - that makes
> it hard to troubleshoot for the user.  Any other ideas on how to
> handle this case?
Can we constrain the io limits specified by the user? When the limits
is smaller than some value such as 1MB/s, one error is provide to the
user and remind he/her that the limits is too small.

>
>>>> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>>>>  }
>>>>  #endif
>>>>
>>>> +/* throttling disk I/O limits */
>>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>>> +{
>>>> +    bs->io_limits_enabled = false;
>>>> +    bs->req_from_queue    = false;
>>>> +
>>>> +    if (bs->block_queue) {
>>>> +        qemu_block_queue_flush(bs->block_queue);
>>>> +        qemu_del_block_queue(bs->block_queue);
>>>
>>> When you fix the acb lifecycle in block-queue.c this will no longer be
>>> safe.  Requests that are dispatched still have acbs that belong to
>> When the request is dispatched, if it is successfully serviced, our
>> acb will been removed.
>
> Yes, that is how the patches work right now.  But the acb lifecycle
> that I explained in response to the other patch changes this.
OK.
>
>>> this queue.  It is simplest to keep the queue for the lifetime of the
>>> BlockDriverState - it's just a list so it doesn't take up much memory.
>> I more prefer to removing this queue, even though it is simple and
>> harmless to let it exist.
>
> If the existing acbs do not touch their BlockQueue after this point in
> the code, then it will be safe to delete the queue since it will no
> longer be referenced.  If you can guarantee this then it's fine to
> keep this code.
I prefer to removing the queue.
>
>>
>>>
>>>> +    }
>>>> +
>>>> +    if (bs->block_timer) {
>>>> +        qemu_del_timer(bs->block_timer);
>>>> +        qemu_free_timer(bs->block_timer);
>>>
>>> To prevent double frees:
>>>
>>> bs->block_timer = NULL;
>> Can qemu_free_timer() not ensure that it is NULL? This is not necessary.:)
>
> No, qemu_free_timer() is a function.  qemu_free_timer() cannot change
> the pointer variable bs->block_timer because in C functions take
> arguments by value.
>
> After qemu_free_timer() bs->block_timer will still hold the old
> address of the (freed) timer.  Then when bdrv_close() is called it
> does qemu_free_timer() again because bs->block_timer is not NULL.  It
> is illegal to free() a pointer twice and it can cause a crash.
Done.
>
>>
>>>
>>>> +    }
>>>> +
>>>> +    bs->slice_start[0]   = 0;
>>>> +    bs->slice_start[1]   = 0;
>>>> +
>>>> +    bs->slice_end[0]     = 0;
>>>> +    bs->slice_end[1]     = 0;
>>>> +}
>>>> +
>>>> +static void bdrv_block_timer(void *opaque)
>>>> +{
>>>> +    BlockDriverState *bs = opaque;
>>>> +    BlockQueue *queue = bs->block_queue;
>>>> +
>>>> +    qemu_block_queue_flush(queue);
>>>> +}
>>>> +
>>>> +void bdrv_io_limits_enable(BlockDriverState *bs)
>>>> +{
>>>> +    bs->req_from_queue = false;
>>>> +
>>>> +    bs->block_queue    = qemu_new_block_queue();
>>>> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>>> +
>>>> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
>>>> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
>>>> +
>>>> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
>>>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>>>> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
>>>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>>>
>>> The slice times could just be initialized to 0 here.  When
>>> bdrv_exceed_io_limits() is called it will start a new slice.
>> The result should be same as current method even though they are set to ZERO.
>
> Yes, the result is the same except we only create new slices in one place.
Done.
>
>>>> +            }
>>>> +
>>>> +            return true;
>>>> +        }
>>>> +    }
>>>> +
>>>> +    elapsed_time  = real_time - bs->slice_start[is_write];
>>>> +    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
>>>
>>> Why * 10.0?
>> elapsed_time currently is in ns, and it need to be translated into a
>> floating value in minutes.
>
> I think you mean seconds instead of minutes.  Then the conversion has
> nothing to do with BLOCK_IO_SLICE_TIME, it's just nanoseconds to
> seconds.  It would be clearer to drop BLOCK_IO_SLICE_TIME from the
> expression and divide by a new NANOSECONDS_PER_SECOND constant
> (1000000000.0).
Done
>
>>>
>>>> +
>>>> +    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
>>>> +                                      is_write, elapsed_time, &bps_wait);
>>>> +    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
>>>> +                                      elapsed_time, &iops_wait);
>>>> +    if (bps_ret || iops_ret) {
>>>> +        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
>>>> +        if (wait) {
>>>> +            *wait = max_wait;
>>>> +        }
>>>> +
>>>> +        real_time = qemu_get_clock_ns(vm_clock);
>>>> +        if (bs->slice_end[is_write] < real_time + max_wait) {
>>>> +            bs->slice_end[is_write] = real_time + max_wait;
>>>> +        }
>>>
>>> Why is this necessary?  We have already extended the slice at the top
>>> of the function.
>> Here it will adjust this end of slice time based on
>> bdrv_exceed_bps_limits(bdrv_exceed_iops_limits).
>> This will be more accurate.
>
> After rereading it I realized that this extends the slice up to the
> estimated dispatch time (max_wait) and not just by
> BLOCK_IO_SLICE_TIME.  This code now makes sense to me: we're trying to
> keep slices around while there are queued requests so that
> iops/throughput history is not forgotten when queued requests are
> dispatched.
Right.
>
>>>
>>>> +
>>>> +        return true;
>>>> +    }
>>>> +
>>>> +    if (wait) {
>>>> +        *wait = 0;
>>>> +    }
>>>> +
>>>> +    return false;
>>>> +}
>>>>
>>>>  /**************************************************************/
>>>>  /* async I/Os */
>>>> @@ -2121,13 +2398,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>>>  {
>>>>     BlockDriver *drv = bs->drv;
>>>>     BlockDriverAIOCB *ret;
>>>> +    uint64_t wait_time = 0;
>>>>
>>>>     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>>>>
>>>> -    if (!drv)
>>>> -        return NULL;
>>>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>>>> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>>>> +        if (bs->io_limits_enabled) {
>>>> +            bs->req_from_queue = false;
>>>> +        }
>>>>         return NULL;
>>>> +    }
>>>> +
>>>> +    /* throttling disk read I/O */
>>>> +    if (bs->io_limits_enabled) {
>>>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
>>>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
>>>> +                              sector_num, qiov, nb_sectors, cb, opaque);
>>>> +            qemu_mod_timer(bs->block_timer,
>>>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>>>
>>> If a guest keeps sending I/O requests, say every millisecond, then we
>>> will delay dispatch forever.  In practice we hope the storage
>>> interface (virtio-blk, LSI SCSI, etc) has a resource limit that
>>> prevents this.  But the point remains that this delays requests that
>>> should be dispatched for too long.  Once the timer has been set we
>>> should not set it again.
>> Can you elaborate this?
>
> Let's wait until the next version of the patch, I've suggested many
> changes and should comment against current code instead of the way I
> think it is going to work :).
OK.
>
> Stefan
>
Stefan Hajnoczi - Aug. 12, 2011, 5:06 a.m.
On Fri, Aug 12, 2011 at 6:00 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Wed, Aug 10, 2011 at 7:00 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Wed, Aug 10, 2011 at 7:57 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>>> On Tue, Aug 9, 2011 at 11:19 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>>> On Tue, Aug 9, 2011 at 5:17 AM, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> wrote:
>>>>> Note:
>>>>>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>>>
>>>> If an I/O request is larger than the limit itself then I think we
>>>> should let it through with a warning that the limit is too low.  This
>>> If it will print a waring, it seems to be not a nice way, the
>>> throtting function will take no effect on this scenario.
>>
>> Setting the limit below the max request size is a misconfiguration.
>> It's a problem that the user needs to be aware of and fix, so a
>> warning is appropriate.  Unfortunately the max request size is not
>> easy to determine from the block layer and may vary between guest
>> OSes, that's why we need to do something at runtime.
>>
>> We cannot leave this case unhandled because it results in the guest
>> timing out I/O without any information about the problem - that makes
>> it hard to troubleshoot for the user.  Any other ideas on how to
>> handle this case?
> Can we constrain the io limits specified by the user? When the limits
> is smaller than some value such as 1MB/s, one error is provide to the
> user and remind he/her that the limits is too small.

It would be an arbitrary limit because the maximum request size
depends on the storage interface or on the host block device.  Guest
OSes may limit the max request size further.  There is no single
constant that we can choose ahead of time.  Any constant value risks
not allowing the user to set a small valid limit or being less than
the max request size and therefore causing I/O to stall again.

Stefan
Zhiyong Wu - Aug. 12, 2011, 5:23 a.m.
On Fri, Aug 12, 2011 at 1:06 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Aug 12, 2011 at 6:00 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Wed, Aug 10, 2011 at 7:00 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>> On Wed, Aug 10, 2011 at 7:57 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>>>> On Tue, Aug 9, 2011 at 11:19 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>>>> On Tue, Aug 9, 2011 at 5:17 AM, Zhi Yong Wu <wuzhy@linux.vnet.ibm.com> wrote:
>>>>>> Note:
>>>>>>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>>>>
>>>>> If an I/O request is larger than the limit itself then I think we
>>>>> should let it through with a warning that the limit is too low.  This
>>>> If it will print a waring, it seems to be not a nice way, the
>>>> throtting function will take no effect on this scenario.
>>>
>>> Setting the limit below the max request size is a misconfiguration.
>>> It's a problem that the user needs to be aware of and fix, so a
>>> warning is appropriate.  Unfortunately the max request size is not
>>> easy to determine from the block layer and may vary between guest
>>> OSes, that's why we need to do something at runtime.
>>>
>>> We cannot leave this case unhandled because it results in the guest
>>> timing out I/O without any information about the problem - that makes
>>> it hard to troubleshoot for the user.  Any other ideas on how to
>>> handle this case?
>> Can we constrain the io limits specified by the user? When the limits
>> is smaller than some value such as 1MB/s, one error is provide to the
>> user and remind he/her that the limits is too small.
>
> It would be an arbitrary limit because the maximum request size
> depends on the storage interface or on the host block device.  Guest
> OSes may limit the max request size further.  There is no single
> constant that we can choose ahead of time.  Any constant value risks
> not allowing the user to set a small valid limit or being less than
> the max request size and therefore causing I/O to stall again.
Let us think at first. When anyone of us has some better idea, we will
rediscuss this.:)
>
> Stefan
>
Zhiyong Wu - Aug. 12, 2011, 5:35 a.m.
On Tue, Aug 9, 2011 at 4:57 PM, Ram Pai <linuxram@us.ibm.com> wrote:
> On Tue, Aug 09, 2011 at 12:17:51PM +0800, Zhi Yong Wu wrote:
>> Note:
>>       1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>       2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits.
>>
>> For these problems, if you have nice thought, pls let us know.:)
>>
>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> ---
>>  block.c     |  347 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>  block.h     |    6 +-
>>  block_int.h |   30 +++++
>>  3 files changed, 372 insertions(+), 11 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index 24a25d5..8fd6643 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -29,6 +29,9 @@
>>  #include "module.h"
>>  #include "qemu-objects.h"
>>
>> +#include "qemu-timer.h"
>> +#include "block/blk-queue.h"
>> +
>>  #ifdef CONFIG_BSD
>>  #include <sys/types.h>
>>  #include <sys/stat.h>
>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>                           const uint8_t *buf, int nb_sectors);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +        double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, uint64_t *wait);
>> +
>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>>  }
>>  #endif
>>
>> +/* throttling disk I/O limits */
>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>> +{
>> +    bs->io_limits_enabled = false;
>> +    bs->req_from_queue    = false;
>> +
>> +    if (bs->block_queue) {
>> +        qemu_block_queue_flush(bs->block_queue);
>> +        qemu_del_block_queue(bs->block_queue);
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +    }
>> +
>> +    bs->slice_start[0]   = 0;
>> +    bs->slice_start[1]   = 0;
>> +
>> +    bs->slice_end[0]     = 0;
>> +    bs->slice_end[1]     = 0;
>> +}
>> +
>> +static void bdrv_block_timer(void *opaque)
>> +{
>> +    BlockDriverState *bs = opaque;
>> +    BlockQueue *queue = bs->block_queue;
>> +
>> +    qemu_block_queue_flush(queue);
>> +}
>> +
>> +void bdrv_io_limits_enable(BlockDriverState *bs)
>> +{
>> +    bs->req_from_queue = false;
>> +
>> +    bs->block_queue    = qemu_new_block_queue();
>> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>> +
>> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
>> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
>
> a minor comment. better to keep the slice_start of the both the READ and WRITE
> side the same.
>
>    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_READ];
>
> saves  a call to qemu_get_clock_ns().
>
>> +
>> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>
>    bs->slice_end[BLOCK_IO_LIMIT_READ] = bs->slice_start[BLOCK_IO_LIMIT_READ] +
>                        BLOCK_IO_SLICE_TIME;
>
> saves one more call to qemu_get_clock_ns()
>
>> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>
>
>    bs->slice_end[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_WRITE] +
>                        BLOCK_IO_SLICE_TIME;
>
> yet another call saving.
>
>
>> +}
>> +
>> +bool bdrv_io_limits_enabled(BlockDriverState *bs)
>> +{
>> +    BlockIOLimit *io_limits = &bs->io_limits;
>> +    if ((io_limits->bps[BLOCK_IO_LIMIT_READ] == 0)
>> +         && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] == 0)
>> +         && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_READ] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] == 0)
>> +         && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] == 0)) {
>> +        return false;
>> +    }
>> +
>> +    return true;
>> +}
>
> can be optimized to:
>
>        return (io_limits->bps[BLOCK_IO_LIMIT_READ]
>                || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
>                || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
>                || io_limits->iops[BLOCK_IO_LIMIT_READ]
>                || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
>                || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]);
I want to apply this, but it violate qemu coding styles.
>
>> +
>>  /* check if the path starts with "<protocol>:" */
>>  static int path_has_protocol(const char *path)
>>  {
>> @@ -642,6 +714,11 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>>
>> +    /* throttling disk I/O limits */
>> +    if (bs->io_limits_enabled) {
>> +        bdrv_io_limits_enable(bs);
>> +    }
>> +
>>      return 0;
>>
>>  unlink_and_fail:
>> @@ -680,6 +757,16 @@ void bdrv_close(BlockDriverState *bs)
>>          if (bs->change_cb)
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>> +
>> +    /* throttling disk I/O limits */
>> +    if (bs->block_queue) {
>> +        qemu_del_block_queue(bs->block_queue);
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +    }
>>  }
>>
>>  void bdrv_close_all(void)
>> @@ -1312,6 +1399,15 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
>>      *psecs = bs->secs;
>>  }
>>
>> +/* throttling disk io limits */
>> +void bdrv_set_io_limits(BlockDriverState *bs,
>> +                            BlockIOLimit *io_limits)
>> +{
>> +    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
>> +    bs->io_limits = *io_limits;
>> +    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
>> +}
>> +
>>  /* Recognize floppy formats */
>>  typedef struct FDFormat {
>>      FDriveType drive;
>> @@ -1707,6 +1803,16 @@ static void bdrv_print_dict(QObject *obj, void *opaque)
>>                              qdict_get_bool(qdict, "ro"),
>>                              qdict_get_str(qdict, "drv"),
>>                              qdict_get_bool(qdict, "encrypted"));
>> +
>> +        monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64
>> +                            " bps_wr=%" PRId64 " iops=%" PRId64
>> +                            " iops_rd=%" PRId64 " iops_wr=%" PRId64,
>> +                            qdict_get_int(qdict, "bps"),
>> +                            qdict_get_int(qdict, "bps_rd"),
>> +                            qdict_get_int(qdict, "bps_wr"),
>> +                            qdict_get_int(qdict, "iops"),
>> +                            qdict_get_int(qdict, "iops_rd"),
>> +                            qdict_get_int(qdict, "iops_wr"));
>>      } else {
>>          monitor_printf(mon, " [not inserted]");
>>      }
>> @@ -1739,10 +1845,22 @@ void bdrv_info(Monitor *mon, QObject **ret_data)
>>              QDict *bs_dict = qobject_to_qdict(bs_obj);
>>
>>              obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
>> -                                     "'encrypted': %i }",
>> +                                     "'encrypted': %i, "
>> +                                     "'bps': %" PRId64 ","
>> +                                     "'bps_rd': %" PRId64 ","
>> +                                     "'bps_wr': %" PRId64 ","
>> +                                     "'iops': %" PRId64 ","
>> +                                     "'iops_rd': %" PRId64 ","
>> +                                     "'iops_wr': %" PRId64 "}",
>>                                       bs->filename, bs->read_only,
>>                                       bs->drv->format_name,
>> -                                     bdrv_is_encrypted(bs));
>> +                                     bdrv_is_encrypted(bs),
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL],
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_READ],
>> +                                     bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_READ],
>> +                                     bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]);
>>              if (bs->backing_file[0] != '\0') {
>>                  QDict *qdict = qobject_to_qdict(obj);
>>                  qdict_put(qdict, "backing_file",
>> @@ -2111,6 +2229,165 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
>>      return buf;
>>  }
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>> +    uint64_t bps_limit = 0;
>> +    double   bytes_limit, bytes_disp, bytes_res;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.bps[is_write]) {
>> +        bps_limit = bs->io_limits.bps[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
>> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
>> +    bytes_limit = bps_limit * slice_time;
>> +    bytes_disp  = bs->io_disps.bytes[is_write];
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bytes_disp += bs->io_disps.bytes[!is_write];
>> +    }
>> +
>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    if (bytes_disp + bytes_res <= bytes_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
>> +
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +                             double elapsed_time, uint64_t *wait) {
>> +    uint64_t iops_limit = 0;
>> +    double   ios_limit, ios_disp;
>> +    double   slice_time, wait_time;
>> +
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.iops[is_write]) {
>> +        iops_limit = bs->io_limits.iops[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
>> +    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
>> +    ios_limit  = iops_limit * slice_time;
>> +    ios_disp   = bs->io_disps.ios[is_write];
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        ios_disp += bs->io_disps.ios[!is_write];
>> +    }
>> +
>> +    if (ios_disp + 1 <= ios_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (ios_disp + 1) / iops_limit;
>> +    if (wait_time > elapsed_time) {
>> +        wait_time = wait_time - elapsed_time;
>> +    } else {
>> +        wait_time = 0;
>> +    }
>> +
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
>> +    }
>> +
>> +    return true;
>> +}
>
> bdrv_exceed_bps_limits() and bdrv_exceed_iops_limits() has almost similar logic.
> Probably can be abstracted into a single function.
>
> RP
>
>
Stefan Hajnoczi - Aug. 12, 2011, 5:47 a.m.
On Fri, Aug 12, 2011 at 6:35 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Tue, Aug 9, 2011 at 4:57 PM, Ram Pai <linuxram@us.ibm.com> wrote:
>> On Tue, Aug 09, 2011 at 12:17:51PM +0800, Zhi Yong Wu wrote:
>>> Note:
>>>       1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>>       2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits.
>>>
>>> For these problems, if you have nice thought, pls let us know.:)
>>>
>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>> ---
>>>  block.c     |  347 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>  block.h     |    6 +-
>>>  block_int.h |   30 +++++
>>>  3 files changed, 372 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/block.c b/block.c
>>> index 24a25d5..8fd6643 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -29,6 +29,9 @@
>>>  #include "module.h"
>>>  #include "qemu-objects.h"
>>>
>>> +#include "qemu-timer.h"
>>> +#include "block/blk-queue.h"
>>> +
>>>  #ifdef CONFIG_BSD
>>>  #include <sys/types.h>
>>>  #include <sys/stat.h>
>>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>>                           const uint8_t *buf, int nb_sectors);
>>>
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>> +        double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, uint64_t *wait);
>>> +
>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>
>>> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>>>  }
>>>  #endif
>>>
>>> +/* throttling disk I/O limits */
>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>> +{
>>> +    bs->io_limits_enabled = false;
>>> +    bs->req_from_queue    = false;
>>> +
>>> +    if (bs->block_queue) {
>>> +        qemu_block_queue_flush(bs->block_queue);
>>> +        qemu_del_block_queue(bs->block_queue);
>>> +    }
>>> +
>>> +    if (bs->block_timer) {
>>> +        qemu_del_timer(bs->block_timer);
>>> +        qemu_free_timer(bs->block_timer);
>>> +    }
>>> +
>>> +    bs->slice_start[0]   = 0;
>>> +    bs->slice_start[1]   = 0;
>>> +
>>> +    bs->slice_end[0]     = 0;
>>> +    bs->slice_end[1]     = 0;
>>> +}
>>> +
>>> +static void bdrv_block_timer(void *opaque)
>>> +{
>>> +    BlockDriverState *bs = opaque;
>>> +    BlockQueue *queue = bs->block_queue;
>>> +
>>> +    qemu_block_queue_flush(queue);
>>> +}
>>> +
>>> +void bdrv_io_limits_enable(BlockDriverState *bs)
>>> +{
>>> +    bs->req_from_queue = false;
>>> +
>>> +    bs->block_queue    = qemu_new_block_queue();
>>> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>> +
>>> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
>>> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
>>
>> a minor comment. better to keep the slice_start of the both the READ and WRITE
>> side the same.
>>
>>    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_READ];
>>
>> saves  a call to qemu_get_clock_ns().
>>
>>> +
>>> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
>>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>>
>>    bs->slice_end[BLOCK_IO_LIMIT_READ] = bs->slice_start[BLOCK_IO_LIMIT_READ] +
>>                        BLOCK_IO_SLICE_TIME;
>>
>> saves one more call to qemu_get_clock_ns()
>>
>>> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
>>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>>
>>
>>    bs->slice_end[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_WRITE] +
>>                        BLOCK_IO_SLICE_TIME;
>>
>> yet another call saving.
>>
>>
>>> +}
>>> +
>>> +bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>> +{
>>> +    BlockIOLimit *io_limits = &bs->io_limits;
>>> +    if ((io_limits->bps[BLOCK_IO_LIMIT_READ] == 0)
>>> +         && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] == 0)
>>> +         && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] == 0)
>>> +         && (io_limits->iops[BLOCK_IO_LIMIT_READ] == 0)
>>> +         && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] == 0)
>>> +         && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] == 0)) {
>>> +        return false;
>>> +    }
>>> +
>>> +    return true;
>>> +}
>>
>> can be optimized to:
>>
>>        return (io_limits->bps[BLOCK_IO_LIMIT_READ]
>>                || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
>>                || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
>>                || io_limits->iops[BLOCK_IO_LIMIT_READ]
>>                || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
>>                || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]);
> I want to apply this, but it violate qemu coding styles.

Perhaps checkpatch.pl complains because of the (...) around the return
value.  Try removing them.

Stefan
Zhiyong Wu - Aug. 12, 2011, 6 a.m.
On Fri, Aug 12, 2011 at 1:47 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Aug 12, 2011 at 6:35 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Tue, Aug 9, 2011 at 4:57 PM, Ram Pai <linuxram@us.ibm.com> wrote:
>>> On Tue, Aug 09, 2011 at 12:17:51PM +0800, Zhi Yong Wu wrote:
>>>> Note:
>>>>       1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>>>>       2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits.
>>>>
>>>> For these problems, if you have nice thought, pls let us know.:)
>>>>
>>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>> ---
>>>>  block.c     |  347 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>>  block.h     |    6 +-
>>>>  block_int.h |   30 +++++
>>>>  3 files changed, 372 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/block.c b/block.c
>>>> index 24a25d5..8fd6643 100644
>>>> --- a/block.c
>>>> +++ b/block.c
>>>> @@ -29,6 +29,9 @@
>>>>  #include "module.h"
>>>>  #include "qemu-objects.h"
>>>>
>>>> +#include "qemu-timer.h"
>>>> +#include "block/blk-queue.h"
>>>> +
>>>>  #ifdef CONFIG_BSD
>>>>  #include <sys/types.h>
>>>>  #include <sys/stat.h>
>>>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>>>                           const uint8_t *buf, int nb_sectors);
>>>>
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>> +        double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, uint64_t *wait);
>>>> +
>>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>
>>>> @@ -90,6 +100,68 @@ int is_windows_drive(const char *filename)
>>>>  }
>>>>  #endif
>>>>
>>>> +/* throttling disk I/O limits */
>>>> +void bdrv_io_limits_disable(BlockDriverState *bs)
>>>> +{
>>>> +    bs->io_limits_enabled = false;
>>>> +    bs->req_from_queue    = false;
>>>> +
>>>> +    if (bs->block_queue) {
>>>> +        qemu_block_queue_flush(bs->block_queue);
>>>> +        qemu_del_block_queue(bs->block_queue);
>>>> +    }
>>>> +
>>>> +    if (bs->block_timer) {
>>>> +        qemu_del_timer(bs->block_timer);
>>>> +        qemu_free_timer(bs->block_timer);
>>>> +    }
>>>> +
>>>> +    bs->slice_start[0]   = 0;
>>>> +    bs->slice_start[1]   = 0;
>>>> +
>>>> +    bs->slice_end[0]     = 0;
>>>> +    bs->slice_end[1]     = 0;
>>>> +}
>>>> +
>>>> +static void bdrv_block_timer(void *opaque)
>>>> +{
>>>> +    BlockDriverState *bs = opaque;
>>>> +    BlockQueue *queue = bs->block_queue;
>>>> +
>>>> +    qemu_block_queue_flush(queue);
>>>> +}
>>>> +
>>>> +void bdrv_io_limits_enable(BlockDriverState *bs)
>>>> +{
>>>> +    bs->req_from_queue = false;
>>>> +
>>>> +    bs->block_queue    = qemu_new_block_queue();
>>>> +    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>>> +
>>>> +    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
>>>> +    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
>>>
>>> a minor comment. better to keep the slice_start of the both the READ and WRITE
>>> side the same.
>>>
>>>    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_READ];
>>>
>>> saves  a call to qemu_get_clock_ns().
>>>
>>>> +
>>>> +    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
>>>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>>>
>>>    bs->slice_end[BLOCK_IO_LIMIT_READ] = bs->slice_start[BLOCK_IO_LIMIT_READ] +
>>>                        BLOCK_IO_SLICE_TIME;
>>>
>>> saves one more call to qemu_get_clock_ns()
>>>
>>>> +    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
>>>> +                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
>>>
>>>
>>>    bs->slice_end[BLOCK_IO_LIMIT_WRITE] = bs->slice_start[BLOCK_IO_LIMIT_WRITE] +
>>>                        BLOCK_IO_SLICE_TIME;
>>>
>>> yet another call saving.
>>>
>>>
>>>> +}
>>>> +
>>>> +bool bdrv_io_limits_enabled(BlockDriverState *bs)
>>>> +{
>>>> +    BlockIOLimit *io_limits = &bs->io_limits;
>>>> +    if ((io_limits->bps[BLOCK_IO_LIMIT_READ] == 0)
>>>> +         && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] == 0)
>>>> +         && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] == 0)
>>>> +         && (io_limits->iops[BLOCK_IO_LIMIT_READ] == 0)
>>>> +         && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] == 0)
>>>> +         && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] == 0)) {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    return true;
>>>> +}
>>>
>>> can be optimized to:
>>>
>>>        return (io_limits->bps[BLOCK_IO_LIMIT_READ]
>>>                || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
>>>                || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
>>>                || io_limits->iops[BLOCK_IO_LIMIT_READ]
>>>                || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
>>>                || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]);
>> I want to apply this, but it violate qemu coding styles.
>
> Perhaps checkpatch.pl complains because of the (...) around the return
> value.  Try removing them.
After i removed the parentheses, it can work now. thanks.
>
> Stefan
>

Patch

diff --git a/block.c b/block.c
index 24a25d5..8fd6643 100644
--- a/block.c
+++ b/block.c
@@ -29,6 +29,9 @@ 
 #include "module.h"
 #include "qemu-objects.h"
 
+#include "qemu-timer.h"
+#include "block/blk-queue.h"
+
 #ifdef CONFIG_BSD
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -58,6 +61,13 @@  static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+        double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, uint64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -90,6 +100,68 @@  int is_windows_drive(const char *filename)
 }
 #endif
 
+/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+    bs->io_limits_enabled = false;
+    bs->req_from_queue    = false;
+
+    if (bs->block_queue) {
+        qemu_block_queue_flush(bs->block_queue);
+        qemu_del_block_queue(bs->block_queue);
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+    }
+
+    bs->slice_start[0]   = 0;
+    bs->slice_start[1]   = 0;
+
+    bs->slice_end[0]     = 0;
+    bs->slice_end[1]     = 0;
+}
+
+static void bdrv_block_timer(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    BlockQueue *queue = bs->block_queue;
+
+    qemu_block_queue_flush(queue);
+}
+
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+    bs->req_from_queue = false;
+
+    bs->block_queue    = qemu_new_block_queue();
+    bs->block_timer    = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+
+    bs->slice_start[BLOCK_IO_LIMIT_READ]  = qemu_get_clock_ns(vm_clock);
+    bs->slice_start[BLOCK_IO_LIMIT_WRITE] = qemu_get_clock_ns(vm_clock);
+
+    bs->slice_end[BLOCK_IO_LIMIT_READ]    =
+                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
+    bs->slice_end[BLOCK_IO_LIMIT_WRITE]   =
+                      qemu_get_clock_ns(vm_clock) + BLOCK_IO_SLICE_TIME;
+}
+
+bool bdrv_io_limits_enabled(BlockDriverState *bs)
+{
+    BlockIOLimit *io_limits = &bs->io_limits;
+    if ((io_limits->bps[BLOCK_IO_LIMIT_READ] == 0)
+         && (io_limits->bps[BLOCK_IO_LIMIT_WRITE] == 0)
+         && (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] == 0)
+         && (io_limits->iops[BLOCK_IO_LIMIT_READ] == 0)
+         && (io_limits->iops[BLOCK_IO_LIMIT_WRITE] == 0)
+         && (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] == 0)) {
+        return false;
+    }
+
+    return true;
+}
+
 /* check if the path starts with "<protocol>:" */
 static int path_has_protocol(const char *path)
 {
@@ -642,6 +714,11 @@  int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
 
+    /* throttling disk I/O limits */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_enable(bs);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -680,6 +757,16 @@  void bdrv_close(BlockDriverState *bs)
         if (bs->change_cb)
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
+
+    /* throttling disk I/O limits */
+    if (bs->block_queue) {
+        qemu_del_block_queue(bs->block_queue);
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+    }
 }
 
 void bdrv_close_all(void)
@@ -1312,6 +1399,15 @@  void bdrv_get_geometry_hint(BlockDriverState *bs,
     *psecs = bs->secs;
 }
 
+/* throttling disk io limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+                            BlockIOLimit *io_limits)
+{
+    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
+    bs->io_limits = *io_limits;
+    bs->io_limits_enabled = bdrv_io_limits_enabled(bs);
+}
+
 /* Recognize floppy formats */
 typedef struct FDFormat {
     FDriveType drive;
@@ -1707,6 +1803,16 @@  static void bdrv_print_dict(QObject *obj, void *opaque)
                             qdict_get_bool(qdict, "ro"),
                             qdict_get_str(qdict, "drv"),
                             qdict_get_bool(qdict, "encrypted"));
+
+        monitor_printf(mon, " bps=%" PRId64 " bps_rd=%" PRId64
+                            " bps_wr=%" PRId64 " iops=%" PRId64
+                            " iops_rd=%" PRId64 " iops_wr=%" PRId64,
+                            qdict_get_int(qdict, "bps"),
+                            qdict_get_int(qdict, "bps_rd"),
+                            qdict_get_int(qdict, "bps_wr"),
+                            qdict_get_int(qdict, "iops"),
+                            qdict_get_int(qdict, "iops_rd"),
+                            qdict_get_int(qdict, "iops_wr"));
     } else {
         monitor_printf(mon, " [not inserted]");
     }
@@ -1739,10 +1845,22 @@  void bdrv_info(Monitor *mon, QObject **ret_data)
             QDict *bs_dict = qobject_to_qdict(bs_obj);
 
             obj = qobject_from_jsonf("{ 'file': %s, 'ro': %i, 'drv': %s, "
-                                     "'encrypted': %i }",
+                                     "'encrypted': %i, "
+                                     "'bps': %" PRId64 ","
+                                     "'bps_rd': %" PRId64 ","
+                                     "'bps_wr': %" PRId64 ","
+                                     "'iops': %" PRId64 ","
+                                     "'iops_rd': %" PRId64 ","
+                                     "'iops_wr': %" PRId64 "}",
                                      bs->filename, bs->read_only,
                                      bs->drv->format_name,
-                                     bdrv_is_encrypted(bs));
+                                     bdrv_is_encrypted(bs),
+                                     bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL],
+                                     bs->io_limits.bps[BLOCK_IO_LIMIT_READ],
+                                     bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE],
+                                     bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL],
+                                     bs->io_limits.iops[BLOCK_IO_LIMIT_READ],
+                                     bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]);
             if (bs->backing_file[0] != '\0') {
                 QDict *qdict = qobject_to_qdict(obj);
                 qdict_put(qdict, "backing_file",
@@ -2111,6 +2229,165 @@  char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
     return buf;
 }
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+                 bool is_write, double elapsed_time, uint64_t *wait) {
+    uint64_t bps_limit = 0;
+    double   bytes_limit, bytes_disp, bytes_res;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
+    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
+    bytes_limit = bps_limit * slice_time;
+    bytes_disp  = bs->io_disps.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_disp += bs->io_disps.bytes[!is_write];
+    }
+
+    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_disp + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+                             double elapsed_time, uint64_t *wait) {
+    uint64_t iops_limit = 0;
+    double   ios_limit, ios_disp;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end[is_write] - bs->slice_start[is_write];
+    slice_time /= (BLOCK_IO_SLICE_TIME * 10.0);
+    ios_limit  = iops_limit * slice_time;
+    ios_disp   = bs->io_disps.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_disp += bs->io_disps.ios[!is_write];
+    }
+
+    if (ios_disp + 1 <= ios_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_disp + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+        wait_time = wait_time - elapsed_time;
+    } else {
+        wait_time = 0;
+    }
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+                           bool is_write, uint64_t *wait) {
+    int64_t  real_time, real_slice;
+    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
+    double   elapsed_time;
+    int      bps_ret, iops_ret;
+
+    real_time = qemu_get_clock_ns(vm_clock);
+    real_slice = bs->slice_end[is_write] - bs->slice_start[is_write];
+    if ((bs->slice_start[is_write] < real_time)
+        && (bs->slice_end[is_write] > real_time)) {
+        bs->slice_end[is_write]   = real_time + BLOCK_IO_SLICE_TIME;
+    } else {
+        bs->slice_start[is_write]     = real_time;
+        bs->slice_end[is_write]       = real_time + BLOCK_IO_SLICE_TIME;
+
+        bs->io_disps.bytes[is_write]  = 0;
+        bs->io_disps.bytes[!is_write] = 0;
+
+        bs->io_disps.ios[is_write]    = 0;
+        bs->io_disps.ios[!is_write]   = 0;
+    }
+
+    /* If a limit was exceeded, immediately queue this request */
+    if (!bs->req_from_queue
+        && !QTAILQ_EMPTY(&bs->block_queue->requests)) {
+        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
+            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+            if (wait) {
+                *wait = 0;
+            }
+
+            return true;
+        }
+    }
+
+    elapsed_time  = real_time - bs->slice_start[is_write];
+    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
+
+    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                      is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                      elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        real_time = qemu_get_clock_ns(vm_clock);
+        if (bs->slice_end[is_write] < real_time + max_wait) {
+            bs->slice_end[is_write] = real_time + max_wait;
+        }
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
 
 /**************************************************************/
 /* async I/Os */
@@ -2121,13 +2398,28 @@  BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
 {
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
+    uint64_t wait_time = 0;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
+        if (bs->io_limits_enabled) {
+            bs->req_from_queue = false;
+        }
         return NULL;
+    }
+
+    /* throttling disk read I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
+                              sector_num, qiov, nb_sectors, cb, opaque);
+            qemu_mod_timer(bs->block_timer,
+                       wait_time + qemu_get_clock_ns(vm_clock));
+            bs->req_from_queue = false;
+            return ret;
+        }
+    }
 
     ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                               cb, opaque);
@@ -2136,6 +2428,16 @@  BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
 	/* Update stats even though technically transfer has not happened. */
 	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
 	bs->rd_ops ++;
+
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
+                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
+        }
+    }
+
+    if (bs->io_limits_enabled) {
+        bs->req_from_queue = false;
     }
 
     return ret;
@@ -2184,15 +2486,18 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
     BlockCompleteData *blk_cb_data;
+    uint64_t wait_time = 0;
 
     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bs->read_only)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bs->read_only
+        || bdrv_check_request(bs, sector_num, nb_sectors)) {
+        if (bs->io_limits_enabled) {
+            bs->req_from_queue = false;
+        }
+
         return NULL;
+    }
 
     if (bs->dirty_bitmap) {
         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2201,6 +2506,18 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
         opaque = blk_cb_data;
     }
 
+    /* throttling disk write I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
+                                  sector_num, qiov, nb_sectors, cb, opaque);
+            qemu_mod_timer(bs->block_timer,
+                                  wait_time + qemu_get_clock_ns(vm_clock));
+            bs->req_from_queue = false;
+            return ret;
+        }
+    }
+
     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
 
@@ -2211,6 +2528,16 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
+
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
+                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
+        }
+    }
+
+    if (bs->io_limits_enabled) {
+        bs->req_from_queue = false;
     }
 
     return ret;
diff --git a/block.h b/block.h
index 859d1d9..3d02902 100644
--- a/block.h
+++ b/block.h
@@ -57,6 +57,11 @@  void bdrv_info(Monitor *mon, QObject **ret_data);
 void bdrv_stats_print(Monitor *mon, const QObject *data);
 void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
+/* disk I/O throttling */
+void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
+bool bdrv_io_limits_enabled(BlockDriverState *bs);
+
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
 BlockDriver *bdrv_find_protocol(const char *filename);
@@ -97,7 +102,6 @@  int bdrv_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt);
 void bdrv_register(BlockDriver *bdrv);
 
-
 typedef struct BdrvCheckResult {
     int corruptions;
     int leaks;
diff --git a/block_int.h b/block_int.h
index 1e265d2..a4cd458 100644
--- a/block_int.h
+++ b/block_int.h
@@ -27,10 +27,17 @@ 
 #include "block.h"
 #include "qemu-option.h"
 #include "qemu-queue.h"
+#include "block/blk-queue.h"
 
 #define BLOCK_FLAG_ENCRYPT	1
 #define BLOCK_FLAG_COMPAT6	4
 
+#define BLOCK_IO_LIMIT_READ     0
+#define BLOCK_IO_LIMIT_WRITE    1
+#define BLOCK_IO_LIMIT_TOTAL    2
+
+#define BLOCK_IO_SLICE_TIME     100000000
+
 #define BLOCK_OPT_SIZE          "size"
 #define BLOCK_OPT_ENCRYPT       "encryption"
 #define BLOCK_OPT_COMPAT6       "compat6"
@@ -46,6 +53,16 @@  typedef struct AIOPool {
     BlockDriverAIOCB *free_aiocb;
 } AIOPool;
 
+typedef struct BlockIOLimit {
+    uint64_t bps[3];
+    uint64_t iops[3];
+} BlockIOLimit;
+
+typedef struct BlockIODisp {
+    uint64_t bytes[2];
+    uint64_t ios[2];
+} BlockIODisp;
+
 struct BlockDriver {
     const char *format_name;
     int instance_size;
@@ -175,6 +192,16 @@  struct BlockDriverState {
 
     void *sync_aiocb;
 
+    /* the time for latest disk I/O */
+    int64_t slice_start[2];
+    int64_t slice_end[2];
+    BlockIOLimit io_limits;
+    BlockIODisp  io_disps;
+    BlockQueue   *block_queue;
+    QEMUTimer    *block_timer;
+    bool         io_limits_enabled;
+    bool         req_from_queue;
+
     /* I/O stats (display with "info blockstats"). */
     uint64_t rd_bytes;
     uint64_t wr_bytes;
@@ -222,6 +249,9 @@  void qemu_aio_release(void *p);
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 
+void bdrv_set_io_limits(BlockDriverState *bs,
+                            BlockIOLimit *io_limits);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif