Patchwork [v2,1/1] The codes V2 for QEMU disk I/O limits.

login
register
mail settings
Submitter Zhi Yong Wu
Date July 26, 2011, 8:59 a.m.
Message ID <1311670746-20498-2-git-send-email-wuzhy@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/106802/
State New
Headers show

Comments

Zhi Yong Wu - July 26, 2011, 8:59 a.m.
Welcome to give me your comments, thanks.

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 Makefile.objs     |    2 +-
 block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 block.h           |    1 -
 block/blk-queue.c |  116 +++++++++++++++++++++
 block/blk-queue.h |   70 +++++++++++++
 block_int.h       |   28 +++++
 blockdev.c        |   21 ++++
 qemu-config.c     |   24 +++++
 qemu-option.c     |   17 +++
 qemu-option.h     |    1 +
 qemu-options.hx   |    1 +
 11 files changed, 559 insertions(+), 10 deletions(-)
 create mode 100644 block/blk-queue.c
 create mode 100644 block/blk-queue.h
Marcelo Tosatti - July 26, 2011, 7:26 p.m.
On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote:
> Welcome to give me your comments, thanks.
> 
> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> ---
>  Makefile.objs     |    2 +-
>  block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>  block.h           |    1 -
>  block/blk-queue.c |  116 +++++++++++++++++++++
>  block/blk-queue.h |   70 +++++++++++++
>  block_int.h       |   28 +++++
>  blockdev.c        |   21 ++++
>  qemu-config.c     |   24 +++++
>  qemu-option.c     |   17 +++
>  qemu-option.h     |    1 +
>  qemu-options.hx   |    1 +
>  11 files changed, 559 insertions(+), 10 deletions(-)
>  create mode 100644 block/blk-queue.c
>  create mode 100644 block/blk-queue.h
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 9f99ed4..06f2033 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>  block-nested-$(CONFIG_CURL) += curl.o
> diff --git a/block.c b/block.c
> index 24a25d5..e54e59c 100644
> --- a/block.c
> +++ b/block.c
> @@ -29,6 +29,9 @@
>  #include "module.h"
>  #include "qemu-objects.h"
>  
> +#include "qemu-timer.h"
> +#include "block/blk-queue.h"
> +
>  #ifdef CONFIG_BSD
>  #include <sys/types.h>
>  #include <sys/stat.h>
> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>                           const uint8_t *buf, int nb_sectors);
>  
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +        double elapsed_time, uint64_t *wait);
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +        bool is_write, uint64_t *wait);
> +
>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>  
> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
>  }
>  #endif
>  
> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
> +{
> +    if ((io_limits->bps[0] == 0)
> +         && (io_limits->bps[1] == 0)
> +         && (io_limits->bps[2] == 0)
> +         && (io_limits->iops[0] == 0)
> +         && (io_limits->iops[1] == 0)
> +         && (io_limits->iops[2] == 0)) {
> +        return 0;
> +    }
> +
> +    return 1;
> +}
> +
>  /* check if the path starts with "<protocol>:" */
>  static int path_has_protocol(const char *path)
>  {
> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
>      }
>  }
>  
> +static void bdrv_block_timer(void *opaque)
> +{
> +    BlockDriverState *bs = opaque;
> +    BlockQueue *queue = bs->block_queue;
> +
> +    while (!QTAILQ_EMPTY(&queue->requests)) {
> +        BlockIORequest *request;
> +        int ret;
> +
> +        request = QTAILQ_FIRST(&queue->requests);
> +        QTAILQ_REMOVE(&queue->requests, request, entry);
> +
> +        ret = qemu_block_queue_handler(request);
> +        if (ret == 0) {
> +            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
> +            break;
> +        }
> +
> +        qemu_free(request);
> +    }
> +}
> +
>  void bdrv_register(BlockDriver *bdrv)
>  {
>      if (!bdrv->bdrv_aio_readv) {
> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>      }
>  
> +    /* throttling disk I/O limits */
> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> +        bs->block_queue = qemu_new_block_queue();
> +        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
> +
> +        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
> +        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
> +    }
> +

It should be possible to tune the limits on the flight, please introduce
QMP commands for that.

>      return 0;
>  
>  unlink_and_fail:
> @@ -680,6 +735,16 @@ void bdrv_close(BlockDriverState *bs)
>          if (bs->change_cb)
>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>      }
> +
> +    /* throttling disk I/O limits */
> +    if (bs->block_queue) {
> +        qemu_del_block_queue(bs->block_queue);
> +    }
> +
> +    if (bs->block_timer) {
> +        qemu_del_timer(bs->block_timer);
> +        qemu_free_timer(bs->block_timer);
> +    }
>  }
>  
>  void bdrv_close_all(void)
> @@ -1312,6 +1377,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
>      *psecs = bs->secs;
>  }
>  
> +/* throttling disk io limits */
> +void bdrv_set_io_limits(BlockDriverState *bs,
> +			    BlockIOLimit *io_limits)
> +{
> +    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
> +    bs->io_limits = *io_limits;
> +}
> +
>  /* Recognize floppy formats */
>  typedef struct FDFormat {
>      FDriveType drive;
> @@ -2111,6 +2184,155 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
>      return buf;
>  }
>  
> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
> +                 bool is_write, double elapsed_time, uint64_t *wait) {
> +    uint64_t bps_limit = 0;
> +    double   bytes_limit, bytes_disp, bytes_res;
> +    double   slice_time = 0.1, wait_time;
> +
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.bps[is_write]) {
> +        bps_limit = bs->io_limits.bps[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    bytes_limit	= bps_limit * slice_time;
> +    bytes_disp  = bs->io_disps.bytes[is_write];
> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> +        bytes_disp += bs->io_disps.bytes[!is_write];
> +    }
> +
> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;

Virtio can submit requests of 512 sectors or more... does not play
well with 1MB/sec limit.

> +    if (bytes_disp + bytes_res <= bytes_limit) {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (bytes_disp + bytes_res - bytes_limit) / bps_limit;
> +    if (!wait_time) {
> +        wait_time = 1;
> +    }
> +
> +    wait_time = wait_time + (slice_time - elapsed_time);
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
> +    }

The guest can keep submitting requests where "wait_time = 1" above, 
and the timer will be rearmed continuously in the future. Can't you
simply arm the timer to the next slice start? _Some_ data must be
transfered by then, anyway (and nothing can be transfered earlier than
that).

Same for iops calculation below.

> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> +                             double elapsed_time, uint64_t *wait) {
> +    uint64_t iops_limit = 0;
> +    double   ios_limit, ios_disp;
> +    double   slice_time = 0.1, wait_time;
> +
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
> +    } else if (bs->io_limits.iops[is_write]) {
> +        iops_limit = bs->io_limits.iops[is_write];
> +    } else {
> +        if (wait) {
> +            *wait = 0;
> +        }
> +
> +        return false;
> +    }
> +
> +    ios_limit = iops_limit * slice_time;
> +    ios_disp  = bs->io_disps.ios[is_write];
> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +        ios_disp += bs->io_disps.ios[!is_write];
> +    }
> +
> +    if (ios_disp + 1 <= ios_limit) {
> +	if (wait) {
> +	    *wait = 0;
> +	}
> +
> +        return false;
> +    }
> +
> +    /* Calc approx time to dispatch */
> +    wait_time = (ios_disp + 1) / iops_limit;
> +    if (wait_time > elapsed_time) {
> +	wait_time = wait_time - elapsed_time;
> +    } else {
> +	wait_time = 0;
> +    }
> +
> +    if (wait) {
> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
> +    }
> +
> +    return true;
> +}
> +
> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> +                           bool is_write, uint64_t *wait) {
> +    int64_t  real_time;
> +    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
> +    double   elapsed_time;
> +    int      bps_ret, iops_ret;
> +
> +    real_time = qemu_get_clock_ns(vm_clock);
> +    if (bs->slice_start[is_write] + BLOCK_IO_SLICE_TIME <= real_time) {
> +        bs->slice_start[is_write] = real_time;
> +
> +        bs->io_disps.bytes[is_write]   = 0;
> +        bs->io_disps.bytes[!is_write]  = 0;
> +
> +        bs->io_disps.ios[is_write]     = 0;
> +        bs->io_disps.ios[!is_write]    = 0;
> +    }
> +
> +    /* If a limit was exceeded, immediately queue this request */
> +    if ((bs->req_from_queue == false)
> +        && !QTAILQ_EMPTY(&bs->block_queue->requests)) {
> +        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
> +            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
> +            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> +            if (wait) {
> +                *wait = 0;
> +            }
> +
> +            return true;
> +        }
> +    }
> +
> +    elapsed_time  = real_time - bs->slice_start[is_write];
> +    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
> +
> +    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
> +                                      is_write, elapsed_time, &bps_wait);
> +    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
> +                                      elapsed_time, &iops_wait);
> +    if (bps_ret || iops_ret) {
> +        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
> +        if (wait) {
> +            *wait = max_wait;
> +        }
> +
> +        return true;
> +    }
> +
> +    if (wait) {
> +        *wait = 0;
> +    }
> +
> +    return false;
> +}
>  
>  /**************************************************************/
>  /* async I/Os */
> @@ -2121,13 +2343,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>  {
>      BlockDriver *drv = bs->drv;
>      BlockDriverAIOCB *ret;
> +    uint64_t wait_time = 0;
>  
>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>  
> -    if (!drv)
> -        return NULL;
> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
> +            bs->req_from_queue = false;
> +        }
>          return NULL;
> +    }
> +
> +    /* throttling disk read I/O */
> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
> +                              sector_num, qiov, nb_sectors, cb, opaque);
> +            qemu_mod_timer(bs->block_timer,
> +                       wait_time + qemu_get_clock_ns(vm_clock));
> +            bs->req_from_queue = false;
> +            return ret;
> +        }
> +    }
>  
>      ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>                                cb, opaque);
> @@ -2136,6 +2373,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>  	/* Update stats even though technically transfer has not happened. */
>  	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>  	bs->rd_ops ++;
> +
> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
> +        }
> +    }
> +
> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> +        bs->req_from_queue = false;
>      }
>  
>      return ret;
> @@ -2184,15 +2431,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>      BlockDriver *drv = bs->drv;
>      BlockDriverAIOCB *ret;
>      BlockCompleteData *blk_cb_data;
> +    uint64_t wait_time = 0;
>  
>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>  
> -    if (!drv)
> -        return NULL;
> -    if (bs->read_only)
> -        return NULL;
> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> +    if (!drv || bs->read_only
> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
> +            bs->req_from_queue = false;
> +        }
> +
>          return NULL;
> +    }
>  
>      if (bs->dirty_bitmap) {
>          blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
> @@ -2201,6 +2451,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>          opaque = blk_cb_data;
>      }
>  
> +    /* throttling disk write I/O */
> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
> +                                  sector_num, qiov, nb_sectors, cb, opaque);
> +            qemu_mod_timer(bs->block_timer,
> +                                  wait_time + qemu_get_clock_ns(vm_clock));
> +            bs->req_from_queue = false;
> +            return ret;
> +        }
> +    }
> +
>      ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
>                                 cb, opaque);
>  
> @@ -2211,6 +2473,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>          if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
>              bs->wr_highest_sector = sector_num + nb_sectors - 1;
>          }
> +
> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
> +                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
> +        }
> +    }
> +
> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> +        bs->req_from_queue = false;
>      }
>  
>      return ret;
> diff --git a/block.h b/block.h
> index 859d1d9..f0dac62 100644
> --- a/block.h
> +++ b/block.h
> @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
>      const char *backing_file, const char *backing_fmt);
>  void bdrv_register(BlockDriver *bdrv);
>  
> -
>  typedef struct BdrvCheckResult {
>      int corruptions;
>      int leaks;
> diff --git a/block/blk-queue.c b/block/blk-queue.c
> new file mode 100644
> index 0000000..09fcfe9
> --- /dev/null
> +++ b/block/blk-queue.c
> @@ -0,0 +1,116 @@
> +/*
> + * QEMU System Emulator queue definition for block layer
> + *
> + * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "block_int.h"
> +#include "qemu-queue.h"
> +#include "block/blk-queue.h"
> +
> +/* The APIs for block request queue on qemu block layer.
> + */
> +
> +static void qemu_block_queue_cancel(BlockDriverAIOCB *acb)
> +{
> +    qemu_aio_release(acb);
> +}
> +
> +static AIOPool block_queue_pool = {
> +    .aiocb_size         = sizeof(struct BlockDriverAIOCB),
> +    .cancel             = qemu_block_queue_cancel,
> +};
> +
> +static void qemu_block_queue_callback(void *opaque, int ret)
> +{
> +    BlockDriverAIOCB *acb = opaque;
> +
> +    qemu_aio_release(acb);
> +}
> +
> +BlockQueue *qemu_new_block_queue(void)
> +{
> +    BlockQueue *queue;
> +
> +    queue = qemu_mallocz(sizeof(BlockQueue));
> +
> +    QTAILQ_INIT(&queue->requests);
> +
> +    return queue;
> +}
> +
> +void qemu_del_block_queue(BlockQueue *queue)
> +{
> +    BlockIORequest *request, *next;
> +
> +    QTAILQ_FOREACH_SAFE(request, &queue->requests, entry, next) {
> +        QTAILQ_REMOVE(&queue->requests, request, entry);
> +        qemu_free(request);
> +    }
> +
> +    qemu_free(queue);
> +}
> +
> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
> +			BlockDriverState *bs,
> +			BlockRequestHandler *handler,
> +			int64_t sector_num,
> +			QEMUIOVector *qiov,
> +			int nb_sectors,
> +			BlockDriverCompletionFunc *cb,
> +			void *opaque)
> +{
> +    BlockIORequest *request;
> +    BlockDriverAIOCB *acb;
> +
> +    request = qemu_malloc(sizeof(BlockIORequest));
> +    request->bs = bs;
> +    request->handler = handler;
> +    request->sector_num = sector_num;
> +    request->qiov = qiov;
> +    request->nb_sectors = nb_sectors;
> +    request->cb = cb;
> +    request->opaque = opaque;
> +
> +    QTAILQ_INSERT_TAIL(&queue->requests, request, entry);
> +
> +    acb = qemu_aio_get(&block_queue_pool, bs,
> +                       qemu_block_queue_callback, opaque);
> +
> +    return acb;
> +}
> +
> +int qemu_block_queue_handler(BlockIORequest *request)
> +{
> +    int ret;
> +    BlockDriverAIOCB *res;
> +
> +    /* indicate this req is from block queue */
> +    request->bs->req_from_queue = true;
> +
> +    res = request->handler(request->bs, request->sector_num,
> +        	                request->qiov, request->nb_sectors,
> +				request->cb, request->opaque);
> +
> +    ret = (res == NULL) ? 0 : 1; 
> +
> +    return ret;
> +}
> diff --git a/block/blk-queue.h b/block/blk-queue.h
> new file mode 100644
> index 0000000..47f8a36
> --- /dev/null
> +++ b/block/blk-queue.h
> @@ -0,0 +1,70 @@
> +/*
> + * QEMU System Emulator queue declaration for block layer
> + *
> + * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#ifndef QEMU_BLOCK_QUEUE_H
> +#define QEMU_BLOCK_QUEUE_H
> +
> +#include "block.h"
> +#include "qemu-queue.h"
> +#include "qemu-common.h"
> +
> +typedef BlockDriverAIOCB* (BlockRequestHandler) (BlockDriverState *bs,
> +                                int64_t sector_num, QEMUIOVector *qiov,
> +                                int nb_sectors, BlockDriverCompletionFunc *cb,
> +                                void *opaque);
> +
> +struct BlockIORequest {
> +    QTAILQ_ENTRY(BlockIORequest) entry;
> +    BlockDriverState *bs;
> +    BlockRequestHandler *handler;
> +    int64_t sector_num;
> +    QEMUIOVector *qiov;
> +    int nb_sectors;
> +    BlockDriverCompletionFunc *cb;
> +    void *opaque;
> +};
> +
> +typedef struct BlockIORequest BlockIORequest;
> +
> +struct BlockQueue {
> +    QTAILQ_HEAD(requests, BlockIORequest) requests;
> +};
> +
> +typedef struct BlockQueue BlockQueue;
> +
> +BlockQueue *qemu_new_block_queue(void);
> +
> +void qemu_del_block_queue(BlockQueue *queue);
> +
> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
> +                        BlockDriverState *bs,
> +                        BlockRequestHandler *handler,
> +                        int64_t sector_num,
> +                        QEMUIOVector *qiov,
> +                        int nb_sectors,
> +                        BlockDriverCompletionFunc *cb,
> +                        void *opaque);
> +
> +int qemu_block_queue_handler(BlockIORequest *request);
> +#endif /* QEMU_BLOCK_QUEUE_H */
> diff --git a/block_int.h b/block_int.h
> index 1e265d2..1587171 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -27,10 +27,17 @@
>  #include "block.h"
>  #include "qemu-option.h"
>  #include "qemu-queue.h"
> +#include "block/blk-queue.h"
>  
>  #define BLOCK_FLAG_ENCRYPT	1
>  #define BLOCK_FLAG_COMPAT6	4
>  
> +#define BLOCK_IO_LIMIT_READ     0
> +#define BLOCK_IO_LIMIT_WRITE    1
> +#define BLOCK_IO_LIMIT_TOTAL    2
> +
> +#define BLOCK_IO_SLICE_TIME     100000000
> +
>  #define BLOCK_OPT_SIZE          "size"
>  #define BLOCK_OPT_ENCRYPT       "encryption"
>  #define BLOCK_OPT_COMPAT6       "compat6"
> @@ -46,6 +53,16 @@ typedef struct AIOPool {
>      BlockDriverAIOCB *free_aiocb;
>  } AIOPool;
>  
> +typedef struct BlockIOLimit {
> +    uint64_t bps[3];
> +    uint64_t iops[3];
> +} BlockIOLimit;
> +
> +typedef struct BlockIODisp {
> +    uint64_t bytes[2];
> +    uint64_t ios[2];
> +} BlockIODisp;
> +
>  struct BlockDriver {
>      const char *format_name;
>      int instance_size;
> @@ -175,6 +192,14 @@ struct BlockDriverState {
>  
>      void *sync_aiocb;
>  
> +    /* the time for latest disk I/O */
> +    int64_t slice_start[2];
> +    BlockIOLimit io_limits;
> +    BlockIODisp  io_disps;
> +    BlockQueue   *block_queue;
> +    QEMUTimer    *block_timer;
> +    bool         req_from_queue;
> +
>      /* I/O stats (display with "info blockstats"). */
>      uint64_t rd_bytes;
>      uint64_t wr_bytes;
> @@ -222,6 +247,9 @@ void qemu_aio_release(void *p);
>  
>  void *qemu_blockalign(BlockDriverState *bs, size_t size);
>  
> +void bdrv_set_io_limits(BlockDriverState *bs,
> +                            BlockIOLimit *io_limits);
> +
>  #ifdef _WIN32
>  int is_windows_drive(const char *filename);
>  #endif
> diff --git a/blockdev.c b/blockdev.c
> index c263663..45602f4 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -238,6 +238,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>      int on_read_error, on_write_error;
>      const char *devaddr;
>      DriveInfo *dinfo;
> +    BlockIOLimit io_limits;
> +    bool iol_flag = false;
> +    const char *iol_opts[7] = {"bps", "bps_rd", "bps_wr", "iops", "iops_rd", "iops_wr"};
>      int is_extboot = 0;
>      int snapshot = 0;
>      int ret;
> @@ -372,6 +375,19 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>          return NULL;
>      }
>  
> +    /* disk io limits */
> +    iol_flag = qemu_opt_io_limits_enable_flag(opts, iol_opts);
> +    if (iol_flag) {
> +        memset(&io_limits, 0, sizeof(BlockIOLimit));
> +
> +        io_limits.bps[2]  = qemu_opt_get_number(opts, "bps", 0);
> +        io_limits.bps[0]  = qemu_opt_get_number(opts, "bps_rd", 0);
> +        io_limits.bps[1]  = qemu_opt_get_number(opts, "bps_wr", 0);
> +        io_limits.iops[2] = qemu_opt_get_number(opts, "iops", 0);
> +        io_limits.iops[0] = qemu_opt_get_number(opts, "iops_rd", 0);
> +        io_limits.iops[1] = qemu_opt_get_number(opts, "iops_wr", 0);
> +    }
> +
>      on_write_error = BLOCK_ERR_STOP_ENOSPC;
>      if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
>          if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
> @@ -483,6 +499,11 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>  
>      bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
>  
> +    /* throttling disk io limits */
> +    if (iol_flag) {
> +        bdrv_set_io_limits(dinfo->bdrv, &io_limits);
> +    }
> +
>      switch(type) {
>      case IF_IDE:
>      case IF_SCSI:
> diff --git a/qemu-config.c b/qemu-config.c
> index efa892c..9232bbb 100644
> --- a/qemu-config.c
> +++ b/qemu-config.c
> @@ -82,6 +82,30 @@ static QemuOptsList qemu_drive_opts = {
>              .name = "boot",
>              .type = QEMU_OPT_BOOL,
>              .help = "make this a boot drive",
> +        },{
> +            .name = "iops",
> +            .type = QEMU_OPT_NUMBER,
> +            .help = "limit total I/O operations per second",
> +        },{
> +            .name = "iops_rd",
> +            .type = QEMU_OPT_NUMBER,
> +            .help = "limit read operations per second",
> +        },{
> +            .name = "iops_wr",
> +            .type = QEMU_OPT_NUMBER,
> +            .help = "limit write operations per second",
> +        },{
> +            .name = "bps",
> +            .type = QEMU_OPT_NUMBER,
> +            .help = "limit total bytes per second",
> +        },{
> +            .name = "bps_rd",
> +            .type = QEMU_OPT_NUMBER,
> +            .help = "limit read bytes per second",
> +        },{
> +            .name = "bps_wr",
> +            .type = QEMU_OPT_NUMBER,
> +            .help = "limit write bytes per second",
>          },
>          { /* end of list */ }
>      },
> diff --git a/qemu-option.c b/qemu-option.c
> index 65db542..9fe234d 100644
> --- a/qemu-option.c
> +++ b/qemu-option.c
> @@ -562,6 +562,23 @@ uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
>      return opt->value.uint;
>  }
>  
> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts)
> +{
> +     int i;
> +     uint64_t opt_val   = 0;
> +     bool iol_flag = false;
> +
> +     for (i = 0; iol_opts[i]; i++) {
> +	 opt_val = qemu_opt_get_number(opts, iol_opts[i], 0);
> +	 if (opt_val != 0) {
> +	     iol_flag = true;
> +	     break;
> +	 }
> +     }
> +
> +     return iol_flag;
> +}
> +
>  uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval)
>  {
>      QemuOpt *opt = qemu_opt_find(opts, name);
> diff --git a/qemu-option.h b/qemu-option.h
> index b515813..fc909f9 100644
> --- a/qemu-option.h
> +++ b/qemu-option.h
> @@ -107,6 +107,7 @@ struct QemuOptsList {
>  const char *qemu_opt_get(QemuOpts *opts, const char *name);
>  int qemu_opt_get_bool(QemuOpts *opts, const char *name, int defval);
>  uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval);
> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts);
>  uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval);
>  int qemu_opt_set(QemuOpts *opts, const char *name, const char *value);
>  typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void *opaque);
> diff --git a/qemu-options.hx b/qemu-options.hx
> index cb3347e..ae219f5 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -121,6 +121,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
>      "       [,cache=writethrough|writeback|none|unsafe][,format=f]\n"
>      "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
>      "       [,readonly=on|off][,boot=on|off]\n"
> +    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
>      "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
>  STEXI
>  @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
> -- 
> 1.7.2.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yoder Stuart-B08248 - July 26, 2011, 9:51 p.m.
> From: Anthony Liguori <address@hidden>
> 
> GLib is an extremely common library that has a portable thread implementation
> along with tons of other goodies.
> 
> GLib and GObject have a fantastic amount of infrastructure we can leverage in
> QEMU including an object oriented programming infrastructure.
> 
> Short term, it has a very nice thread pool implementation that we could leverage
> in something like virtio-9p.  It also has a test harness implementation that
> this series will use.
> 
> Signed-off-by: Anthony Liguori <address@hidden>
> Signed-off-by: Michael Roth <address@hidden>
> Signed-off-by: Luiz Capitulino <address@hidden>
> ---
>  Makefile        |    2 ++
>  Makefile.objs   |    1 +
>  Makefile.target |    1 +
>  configure       |   13 +++++++++++++
>  4 files changed, 17 insertions(+), 0 deletions(-)
> 
> diff --git a/Makefile b/Makefile
> index b3ffbe2..42ae4e5 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -106,6 +106,8 @@ audio/audio.o audio/fmodaudio.o: QEMU_CFLAGS += 
> $(FMOD_CFLAGS)
>  
>  QEMU_CFLAGS+=$(CURL_CFLAGS)
>  
> +QEMU_CFLAGS+=$(GLIB_CFLAGS)
> +
>  ui/cocoa.o: ui/cocoa.m
>  
>  ui/sdl.o audio/sdlaudio.o ui/sdl_zoom.o baum.o: QEMU_CFLAGS += $(SDL_CFLAGS)
> diff --git a/Makefile.objs b/Makefile.objs
> index c43ed05..55d18bb 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -376,3 +376,4 @@ vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS)
>  
>  vl.o: QEMU_CFLAGS+=$(SDL_CFLAGS)
>  
> +vl.o: QEMU_CFLAGS+=$(GLIB_CFLAGS)
> diff --git a/Makefile.target b/Makefile.target
> index e20a313..cde509b 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -204,6 +204,7 @@ QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
>  QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
>  QEMU_CFLAGS += $(VNC_JPEG_CFLAGS)
>  QEMU_CFLAGS += $(VNC_PNG_CFLAGS)
> +QEMU_CFLAGS += $(GLIB_CFLAGS)
>  
>  # xen support
>  obj-$(CONFIG_XEN) += xen-all.o xen_machine_pv.o xen_domainbuild.o 
> xen-mapcache.o
> diff --git a/configure b/configure
> index e57efb1..c0c8fdf 100755
> --- a/configure
> +++ b/configure
> @@ -1803,6 +1803,18 @@ EOF
>  fi
>  
>  ##########################################
> +# glib support probe
> +if $pkg_config --modversion gthread-2.0 gio-2.0 > /dev/null 2>&1 ; then
> +    glib_cflags=`$pkg_config --cflags gthread-2.0 gio-2.0 2>/dev/null`
> +    glib_libs=`$pkg_config --libs gthread-2.0 gio-2.0 2>/dev/null`
> +    libs_softmmu="$glib_libs $libs_softmmu"
> +    libs_tools="$glib_libs $libs_tools"
> +else
> +    echo "glib-2.0 required to compile QEMU"
> +    exit 1
> +fi

I am having issues with this in a cross compilation 
environment.   In Power embedded, almost all our
development is using cross toolchains.

Neither glib or pkg-config are in our cross build environment
and I'm having issues getting them built and installed.
Not even sure if pkg-config is even supposed to work
in a cross development environment...I'm new to that
tool and poking around a bit with google raises
some questions.

Wanted to make you aware of the issue...

Stuart
Anthony Liguori - July 26, 2011, 10:09 p.m.
On 07/26/2011 04:51 PM, Yoder Stuart-B08248 wrote:
>
> I am having issues with this in a cross compilation
> environment.   In Power embedded, almost all our
> development is using cross toolchains.
>
> Neither glib or pkg-config are in our cross build environment
> and I'm having issues getting them built and installed.
> Not even sure if pkg-config is even supposed to work
> in a cross development environment...I'm new to that
> tool and poking around a bit with google raises
> some questions.

You're probably setting up your cross environment incorrectly which, 
unfortunately, is very common.

The proper thing to do is to have GCC use a different system include 
directory and a different prefix.  That will result in a directory where 
there are gcc binaries with normal names installed in ${cross_prefix}/bin

You need to build and install pkg-config to this prefix too, and then 
when it comes time to actually doing the QEMU configure, you should do 
something like:

export PATH=${cross_prefix}/bin:$PATH
export PKG_CONFIG_PATH=${cross_prefix}/lib/pkg-config:$PKG_CONFIG_PATH

Many automated cross compiler environment scripts will install specially 
named versions of gcc and binutils in your normal $PATH.  The trouble 
is, this is a bit of a hack and unless you know to make this hack work 
with other build tools, it all comes tumbling down.

Regards,

Anthony Liguori

>
> Wanted to make you aware of the issue...
>
> Stuart
>
Benjamin Herrenschmidt - July 27, 2011, 8:54 a.m.
> You're probably setting up your cross environment incorrectly which, 
> unfortunately, is very common.
> 
> The proper thing to do is to have GCC use a different system include 
> directory and a different prefix.  That will result in a directory where 
> there are gcc binaries with normal names installed in ${cross_prefix}/bin
> 
> You need to build and install pkg-config to this prefix too, and then 
> when it comes time to actually doing the QEMU configure, you should do 
> something like:
> 
> export PATH=${cross_prefix}/bin:$PATH
> export PKG_CONFIG_PATH=${cross_prefix}/lib/pkg-config:$PKG_CONFIG_PATH
> 
> Many automated cross compiler environment scripts will install specially 
> named versions of gcc and binutils in your normal $PATH.  The trouble 
> is, this is a bit of a hack and unless you know to make this hack work 
> with other build tools, it all comes tumbling down.

Well, that hard requirement is causing us problem on our 32/64-bit cross
builds as well.

It looks like glib (at least recent versions in -sid) can't be built
64-bit on a 32-bit system :-( At least not without fixing some horrid
bugs in there related to some generated include path from what David
says (I'll let him comment further).

In general, every time you add a library requirement without some config
option to disable it for cases such as ours, you add pain :-)

Now, in the specific case of glib, I understand why you would want to
get rid of re-invented wheels and use it, so I'm not specifically
criticizing that specific change, we'll eventually have to fix it
anyways. Just a heads up to be careful with hard requirements in
general.

Cheers,
Ben.
Zhiyong Wu - July 27, 2011, 10:17 a.m.
On Wed, Jul 27, 2011 at 3:26 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote:
>> Welcome to give me your comments, thanks.
>>
>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>> ---
>>  Makefile.objs     |    2 +-
>>  block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>  block.h           |    1 -
>>  block/blk-queue.c |  116 +++++++++++++++++++++
>>  block/blk-queue.h |   70 +++++++++++++
>>  block_int.h       |   28 +++++
>>  blockdev.c        |   21 ++++
>>  qemu-config.c     |   24 +++++
>>  qemu-option.c     |   17 +++
>>  qemu-option.h     |    1 +
>>  qemu-options.hx   |    1 +
>>  11 files changed, 559 insertions(+), 10 deletions(-)
>>  create mode 100644 block/blk-queue.c
>>  create mode 100644 block/blk-queue.h
>>
>> diff --git a/Makefile.objs b/Makefile.objs
>> index 9f99ed4..06f2033 100644
>> --- a/Makefile.objs
>> +++ b/Makefile.objs
>> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>  block-nested-y += qed-check.o
>> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
>>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>>  block-nested-$(CONFIG_CURL) += curl.o
>> diff --git a/block.c b/block.c
>> index 24a25d5..e54e59c 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -29,6 +29,9 @@
>>  #include "module.h"
>>  #include "qemu-objects.h"
>>
>> +#include "qemu-timer.h"
>> +#include "block/blk-queue.h"
>> +
>>  #ifdef CONFIG_BSD
>>  #include <sys/types.h>
>>  #include <sys/stat.h>
>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>                           const uint8_t *buf, int nb_sectors);
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +        double elapsed_time, uint64_t *wait);
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +        bool is_write, uint64_t *wait);
>> +
>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>
>> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
>>  }
>>  #endif
>>
>> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
>> +{
>> +    if ((io_limits->bps[0] == 0)
>> +         && (io_limits->bps[1] == 0)
>> +         && (io_limits->bps[2] == 0)
>> +         && (io_limits->iops[0] == 0)
>> +         && (io_limits->iops[1] == 0)
>> +         && (io_limits->iops[2] == 0)) {
>> +        return 0;
>> +    }
>> +
>> +    return 1;
>> +}
>> +
>>  /* check if the path starts with "<protocol>:" */
>>  static int path_has_protocol(const char *path)
>>  {
>> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
>>      }
>>  }
>>
>> +static void bdrv_block_timer(void *opaque)
>> +{
>> +    BlockDriverState *bs = opaque;
>> +    BlockQueue *queue = bs->block_queue;
>> +
>> +    while (!QTAILQ_EMPTY(&queue->requests)) {
>> +        BlockIORequest *request;
>> +        int ret;
>> +
>> +        request = QTAILQ_FIRST(&queue->requests);
>> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>> +
>> +        ret = qemu_block_queue_handler(request);
>> +        if (ret == 0) {
>> +            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
>> +            break;
>> +        }
>> +
>> +        qemu_free(request);
>> +    }
>> +}
>> +
>>  void bdrv_register(BlockDriver *bdrv)
>>  {
>>      if (!bdrv->bdrv_aio_readv) {
>> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>>
>> +    /* throttling disk I/O limits */
>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +        bs->block_queue = qemu_new_block_queue();
>> +        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>> +
>> +        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
>> +        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
>> +    }
>> +
>
> It should be possible to tune the limits on the flight, please introduce
> QMP commands for that.
Yeah, I am working on this.
>
>>      return 0;
>>
>>  unlink_and_fail:
>> @@ -680,6 +735,16 @@ void bdrv_close(BlockDriverState *bs)
>>          if (bs->change_cb)
>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>      }
>> +
>> +    /* throttling disk I/O limits */
>> +    if (bs->block_queue) {
>> +        qemu_del_block_queue(bs->block_queue);
>> +    }
>> +
>> +    if (bs->block_timer) {
>> +        qemu_del_timer(bs->block_timer);
>> +        qemu_free_timer(bs->block_timer);
>> +    }
>>  }
>>
>>  void bdrv_close_all(void)
>> @@ -1312,6 +1377,14 @@ void bdrv_get_geometry_hint(BlockDriverState *bs,
>>      *psecs = bs->secs;
>>  }
>>
>> +/* throttling disk io limits */
>> +void bdrv_set_io_limits(BlockDriverState *bs,
>> +                         BlockIOLimit *io_limits)
>> +{
>> +    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
>> +    bs->io_limits = *io_limits;
>> +}
>> +
>>  /* Recognize floppy formats */
>>  typedef struct FDFormat {
>>      FDriveType drive;
>> @@ -2111,6 +2184,155 @@ char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
>>      return buf;
>>  }
>>
>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>> +                 bool is_write, double elapsed_time, uint64_t *wait) {
>> +    uint64_t bps_limit = 0;
>> +    double   bytes_limit, bytes_disp, bytes_res;
>> +    double   slice_time = 0.1, wait_time;
>> +
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.bps[is_write]) {
>> +        bps_limit = bs->io_limits.bps[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    bytes_limit      = bps_limit * slice_time;
>> +    bytes_disp  = bs->io_disps.bytes[is_write];
>> +    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
>> +        bytes_disp += bs->io_disps.bytes[!is_write];
>> +    }
>> +
>> +    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>
> Virtio can submit requests of 512 sectors or more... does not play
> well with 1MB/sec limit.
Yeah, thanks for you catch. I will fix this.
>
>> +    if (bytes_disp + bytes_res <= bytes_limit) {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (bytes_disp + bytes_res - bytes_limit) / bps_limit;
>> +    if (!wait_time) {
>> +        wait_time = 1;
>> +    }
>> +
>> +    wait_time = wait_time + (slice_time - elapsed_time);
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
>> +    }
>
> The guest can keep submitting requests where "wait_time = 1" above,
> and the timer will be rearmed continuously in the future. Can't you
> simply arm the timer to the next slice start? _Some_ data must be
> transfered by then, anyway (and nothing can be transfered earlier than
> that).
Sorry, i have got what you mean. Can you elaborate in more detail?

>
> Same for iops calculation below.
>
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> +                             double elapsed_time, uint64_t *wait) {
>> +    uint64_t iops_limit = 0;
>> +    double   ios_limit, ios_disp;
>> +    double   slice_time = 0.1, wait_time;
>> +
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
>> +    } else if (bs->io_limits.iops[is_write]) {
>> +        iops_limit = bs->io_limits.iops[is_write];
>> +    } else {
>> +        if (wait) {
>> +            *wait = 0;
>> +        }
>> +
>> +        return false;
>> +    }
>> +
>> +    ios_limit = iops_limit * slice_time;
>> +    ios_disp  = bs->io_disps.ios[is_write];
>> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +        ios_disp += bs->io_disps.ios[!is_write];
>> +    }
>> +
>> +    if (ios_disp + 1 <= ios_limit) {
>> +     if (wait) {
>> +         *wait = 0;
>> +     }
>> +
>> +        return false;
>> +    }
>> +
>> +    /* Calc approx time to dispatch */
>> +    wait_time = (ios_disp + 1) / iops_limit;
>> +    if (wait_time > elapsed_time) {
>> +     wait_time = wait_time - elapsed_time;
>> +    } else {
>> +     wait_time = 0;
>> +    }
>> +
>> +    if (wait) {
>> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> +                           bool is_write, uint64_t *wait) {
>> +    int64_t  real_time;
>> +    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
>> +    double   elapsed_time;
>> +    int      bps_ret, iops_ret;
>> +
>> +    real_time = qemu_get_clock_ns(vm_clock);
>> +    if (bs->slice_start[is_write] + BLOCK_IO_SLICE_TIME <= real_time) {
>> +        bs->slice_start[is_write] = real_time;
>> +
>> +        bs->io_disps.bytes[is_write]   = 0;
>> +        bs->io_disps.bytes[!is_write]  = 0;
>> +
>> +        bs->io_disps.ios[is_write]     = 0;
>> +        bs->io_disps.ios[!is_write]    = 0;
>> +    }
>> +
>> +    /* If a limit was exceeded, immediately queue this request */
>> +    if ((bs->req_from_queue == false)
>> +        && !QTAILQ_EMPTY(&bs->block_queue->requests)) {
>> +        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
>> +            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
>> +            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> +            if (wait) {
>> +                *wait = 0;
>> +            }
>> +
>> +            return true;
>> +        }
>> +    }
>> +
>> +    elapsed_time  = real_time - bs->slice_start[is_write];
>> +    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
>> +
>> +    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
>> +                                      is_write, elapsed_time, &bps_wait);
>> +    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
>> +                                      elapsed_time, &iops_wait);
>> +    if (bps_ret || iops_ret) {
>> +        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
>> +        if (wait) {
>> +            *wait = max_wait;
>> +        }
>> +
>> +        return true;
>> +    }
>> +
>> +    if (wait) {
>> +        *wait = 0;
>> +    }
>> +
>> +    return false;
>> +}
>>
>>  /**************************************************************/
>>  /* async I/Os */
>> @@ -2121,13 +2343,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>  {
>>      BlockDriver *drv = bs->drv;
>>      BlockDriverAIOCB *ret;
>> +    uint64_t wait_time = 0;
>>
>>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>>
>> -    if (!drv)
>> -        return NULL;
>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +            bs->req_from_queue = false;
>> +        }
>>          return NULL;
>> +    }
>> +
>> +    /* throttling disk read I/O */
>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
>> +                              sector_num, qiov, nb_sectors, cb, opaque);
>> +            qemu_mod_timer(bs->block_timer,
>> +                       wait_time + qemu_get_clock_ns(vm_clock));
>> +            bs->req_from_queue = false;
>> +            return ret;
>> +        }
>> +    }
>>
>>      ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>>                                cb, opaque);
>> @@ -2136,6 +2373,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>>       /* Update stats even though technically transfer has not happened. */
>>       bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>>       bs->rd_ops ++;
>> +
>> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
>> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
>> +        }
>> +    }
>> +
>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +        bs->req_from_queue = false;
>>      }
>>
>>      return ret;
>> @@ -2184,15 +2431,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>      BlockDriver *drv = bs->drv;
>>      BlockDriverAIOCB *ret;
>>      BlockCompleteData *blk_cb_data;
>> +    uint64_t wait_time = 0;
>>
>>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>>
>> -    if (!drv)
>> -        return NULL;
>> -    if (bs->read_only)
>> -        return NULL;
>> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> +    if (!drv || bs->read_only
>> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
>> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +            bs->req_from_queue = false;
>> +        }
>> +
>>          return NULL;
>> +    }
>>
>>      if (bs->dirty_bitmap) {
>>          blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
>> @@ -2201,6 +2451,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>          opaque = blk_cb_data;
>>      }
>>
>> +    /* throttling disk write I/O */
>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
>> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
>> +                                  sector_num, qiov, nb_sectors, cb, opaque);
>> +            qemu_mod_timer(bs->block_timer,
>> +                                  wait_time + qemu_get_clock_ns(vm_clock));
>> +            bs->req_from_queue = false;
>> +            return ret;
>> +        }
>> +    }
>> +
>>      ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
>>                                 cb, opaque);
>>
>> @@ -2211,6 +2473,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>>          if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
>>              bs->wr_highest_sector = sector_num + nb_sectors - 1;
>>          }
>> +
>> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
>> +                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> +            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
>> +        }
>> +    }
>> +
>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> +        bs->req_from_queue = false;
>>      }
>>
>>      return ret;
>> diff --git a/block.h b/block.h
>> index 859d1d9..f0dac62 100644
>> --- a/block.h
>> +++ b/block.h
>> @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
>>      const char *backing_file, const char *backing_fmt);
>>  void bdrv_register(BlockDriver *bdrv);
>>
>> -
>>  typedef struct BdrvCheckResult {
>>      int corruptions;
>>      int leaks;
>> diff --git a/block/blk-queue.c b/block/blk-queue.c
>> new file mode 100644
>> index 0000000..09fcfe9
>> --- /dev/null
>> +++ b/block/blk-queue.c
>> @@ -0,0 +1,116 @@
>> +/*
>> + * QEMU System Emulator queue definition for block layer
>> + *
>> + * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a copy
>> + * of this software and associated documentation files (the "Software"), to deal
>> + * in the Software without restriction, including without limitation the rights
>> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>> + * copies of the Software, and to permit persons to whom the Software is
>> + * furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>> + * THE SOFTWARE.
>> + */
>> +
>> +#include "block_int.h"
>> +#include "qemu-queue.h"
>> +#include "block/blk-queue.h"
>> +
>> +/* The APIs for block request queue on qemu block layer.
>> + */
>> +
>> +static void qemu_block_queue_cancel(BlockDriverAIOCB *acb)
>> +{
>> +    qemu_aio_release(acb);
>> +}
>> +
>> +static AIOPool block_queue_pool = {
>> +    .aiocb_size         = sizeof(struct BlockDriverAIOCB),
>> +    .cancel             = qemu_block_queue_cancel,
>> +};
>> +
>> +static void qemu_block_queue_callback(void *opaque, int ret)
>> +{
>> +    BlockDriverAIOCB *acb = opaque;
>> +
>> +    qemu_aio_release(acb);
>> +}
>> +
>> +BlockQueue *qemu_new_block_queue(void)
>> +{
>> +    BlockQueue *queue;
>> +
>> +    queue = qemu_mallocz(sizeof(BlockQueue));
>> +
>> +    QTAILQ_INIT(&queue->requests);
>> +
>> +    return queue;
>> +}
>> +
>> +void qemu_del_block_queue(BlockQueue *queue)
>> +{
>> +    BlockIORequest *request, *next;
>> +
>> +    QTAILQ_FOREACH_SAFE(request, &queue->requests, entry, next) {
>> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>> +        qemu_free(request);
>> +    }
>> +
>> +    qemu_free(queue);
>> +}
>> +
>> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
>> +                     BlockDriverState *bs,
>> +                     BlockRequestHandler *handler,
>> +                     int64_t sector_num,
>> +                     QEMUIOVector *qiov,
>> +                     int nb_sectors,
>> +                     BlockDriverCompletionFunc *cb,
>> +                     void *opaque)
>> +{
>> +    BlockIORequest *request;
>> +    BlockDriverAIOCB *acb;
>> +
>> +    request = qemu_malloc(sizeof(BlockIORequest));
>> +    request->bs = bs;
>> +    request->handler = handler;
>> +    request->sector_num = sector_num;
>> +    request->qiov = qiov;
>> +    request->nb_sectors = nb_sectors;
>> +    request->cb = cb;
>> +    request->opaque = opaque;
>> +
>> +    QTAILQ_INSERT_TAIL(&queue->requests, request, entry);
>> +
>> +    acb = qemu_aio_get(&block_queue_pool, bs,
>> +                       qemu_block_queue_callback, opaque);
>> +
>> +    return acb;
>> +}
>> +
>> +int qemu_block_queue_handler(BlockIORequest *request)
>> +{
>> +    int ret;
>> +    BlockDriverAIOCB *res;
>> +
>> +    /* indicate this req is from block queue */
>> +    request->bs->req_from_queue = true;
>> +
>> +    res = request->handler(request->bs, request->sector_num,
>> +                             request->qiov, request->nb_sectors,
>> +                             request->cb, request->opaque);
>> +
>> +    ret = (res == NULL) ? 0 : 1;
>> +
>> +    return ret;
>> +}
>> diff --git a/block/blk-queue.h b/block/blk-queue.h
>> new file mode 100644
>> index 0000000..47f8a36
>> --- /dev/null
>> +++ b/block/blk-queue.h
>> @@ -0,0 +1,70 @@
>> +/*
>> + * QEMU System Emulator queue declaration for block layer
>> + *
>> + * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a copy
>> + * of this software and associated documentation files (the "Software"), to deal
>> + * in the Software without restriction, including without limitation the rights
>> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>> + * copies of the Software, and to permit persons to whom the Software is
>> + * furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice shall be included in
>> + * all copies or substantial portions of the Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>> + * THE SOFTWARE.
>> + */
>> +
>> +#ifndef QEMU_BLOCK_QUEUE_H
>> +#define QEMU_BLOCK_QUEUE_H
>> +
>> +#include "block.h"
>> +#include "qemu-queue.h"
>> +#include "qemu-common.h"
>> +
>> +typedef BlockDriverAIOCB* (BlockRequestHandler) (BlockDriverState *bs,
>> +                                int64_t sector_num, QEMUIOVector *qiov,
>> +                                int nb_sectors, BlockDriverCompletionFunc *cb,
>> +                                void *opaque);
>> +
>> +struct BlockIORequest {
>> +    QTAILQ_ENTRY(BlockIORequest) entry;
>> +    BlockDriverState *bs;
>> +    BlockRequestHandler *handler;
>> +    int64_t sector_num;
>> +    QEMUIOVector *qiov;
>> +    int nb_sectors;
>> +    BlockDriverCompletionFunc *cb;
>> +    void *opaque;
>> +};
>> +
>> +typedef struct BlockIORequest BlockIORequest;
>> +
>> +struct BlockQueue {
>> +    QTAILQ_HEAD(requests, BlockIORequest) requests;
>> +};
>> +
>> +typedef struct BlockQueue BlockQueue;
>> +
>> +BlockQueue *qemu_new_block_queue(void);
>> +
>> +void qemu_del_block_queue(BlockQueue *queue);
>> +
>> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
>> +                        BlockDriverState *bs,
>> +                        BlockRequestHandler *handler,
>> +                        int64_t sector_num,
>> +                        QEMUIOVector *qiov,
>> +                        int nb_sectors,
>> +                        BlockDriverCompletionFunc *cb,
>> +                        void *opaque);
>> +
>> +int qemu_block_queue_handler(BlockIORequest *request);
>> +#endif /* QEMU_BLOCK_QUEUE_H */
>> diff --git a/block_int.h b/block_int.h
>> index 1e265d2..1587171 100644
>> --- a/block_int.h
>> +++ b/block_int.h
>> @@ -27,10 +27,17 @@
>>  #include "block.h"
>>  #include "qemu-option.h"
>>  #include "qemu-queue.h"
>> +#include "block/blk-queue.h"
>>
>>  #define BLOCK_FLAG_ENCRYPT   1
>>  #define BLOCK_FLAG_COMPAT6   4
>>
>> +#define BLOCK_IO_LIMIT_READ     0
>> +#define BLOCK_IO_LIMIT_WRITE    1
>> +#define BLOCK_IO_LIMIT_TOTAL    2
>> +
>> +#define BLOCK_IO_SLICE_TIME     100000000
>> +
>>  #define BLOCK_OPT_SIZE          "size"
>>  #define BLOCK_OPT_ENCRYPT       "encryption"
>>  #define BLOCK_OPT_COMPAT6       "compat6"
>> @@ -46,6 +53,16 @@ typedef struct AIOPool {
>>      BlockDriverAIOCB *free_aiocb;
>>  } AIOPool;
>>
>> +typedef struct BlockIOLimit {
>> +    uint64_t bps[3];
>> +    uint64_t iops[3];
>> +} BlockIOLimit;
>> +
>> +typedef struct BlockIODisp {
>> +    uint64_t bytes[2];
>> +    uint64_t ios[2];
>> +} BlockIODisp;
>> +
>>  struct BlockDriver {
>>      const char *format_name;
>>      int instance_size;
>> @@ -175,6 +192,14 @@ struct BlockDriverState {
>>
>>      void *sync_aiocb;
>>
>> +    /* the time for latest disk I/O */
>> +    int64_t slice_start[2];
>> +    BlockIOLimit io_limits;
>> +    BlockIODisp  io_disps;
>> +    BlockQueue   *block_queue;
>> +    QEMUTimer    *block_timer;
>> +    bool         req_from_queue;
>> +
>>      /* I/O stats (display with "info blockstats"). */
>>      uint64_t rd_bytes;
>>      uint64_t wr_bytes;
>> @@ -222,6 +247,9 @@ void qemu_aio_release(void *p);
>>
>>  void *qemu_blockalign(BlockDriverState *bs, size_t size);
>>
>> +void bdrv_set_io_limits(BlockDriverState *bs,
>> +                            BlockIOLimit *io_limits);
>> +
>>  #ifdef _WIN32
>>  int is_windows_drive(const char *filename);
>>  #endif
>> diff --git a/blockdev.c b/blockdev.c
>> index c263663..45602f4 100644
>> --- a/blockdev.c
>> +++ b/blockdev.c
>> @@ -238,6 +238,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>>      int on_read_error, on_write_error;
>>      const char *devaddr;
>>      DriveInfo *dinfo;
>> +    BlockIOLimit io_limits;
>> +    bool iol_flag = false;
>> +    const char *iol_opts[7] = {"bps", "bps_rd", "bps_wr", "iops", "iops_rd", "iops_wr"};
>>      int is_extboot = 0;
>>      int snapshot = 0;
>>      int ret;
>> @@ -372,6 +375,19 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>>          return NULL;
>>      }
>>
>> +    /* disk io limits */
>> +    iol_flag = qemu_opt_io_limits_enable_flag(opts, iol_opts);
>> +    if (iol_flag) {
>> +        memset(&io_limits, 0, sizeof(BlockIOLimit));
>> +
>> +        io_limits.bps[2]  = qemu_opt_get_number(opts, "bps", 0);
>> +        io_limits.bps[0]  = qemu_opt_get_number(opts, "bps_rd", 0);
>> +        io_limits.bps[1]  = qemu_opt_get_number(opts, "bps_wr", 0);
>> +        io_limits.iops[2] = qemu_opt_get_number(opts, "iops", 0);
>> +        io_limits.iops[0] = qemu_opt_get_number(opts, "iops_rd", 0);
>> +        io_limits.iops[1] = qemu_opt_get_number(opts, "iops_wr", 0);
>> +    }
>> +
>>      on_write_error = BLOCK_ERR_STOP_ENOSPC;
>>      if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
>>          if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
>> @@ -483,6 +499,11 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>>
>>      bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
>>
>> +    /* throttling disk io limits */
>> +    if (iol_flag) {
>> +        bdrv_set_io_limits(dinfo->bdrv, &io_limits);
>> +    }
>> +
>>      switch(type) {
>>      case IF_IDE:
>>      case IF_SCSI:
>> diff --git a/qemu-config.c b/qemu-config.c
>> index efa892c..9232bbb 100644
>> --- a/qemu-config.c
>> +++ b/qemu-config.c
>> @@ -82,6 +82,30 @@ static QemuOptsList qemu_drive_opts = {
>>              .name = "boot",
>>              .type = QEMU_OPT_BOOL,
>>              .help = "make this a boot drive",
>> +        },{
>> +            .name = "iops",
>> +            .type = QEMU_OPT_NUMBER,
>> +            .help = "limit total I/O operations per second",
>> +        },{
>> +            .name = "iops_rd",
>> +            .type = QEMU_OPT_NUMBER,
>> +            .help = "limit read operations per second",
>> +        },{
>> +            .name = "iops_wr",
>> +            .type = QEMU_OPT_NUMBER,
>> +            .help = "limit write operations per second",
>> +        },{
>> +            .name = "bps",
>> +            .type = QEMU_OPT_NUMBER,
>> +            .help = "limit total bytes per second",
>> +        },{
>> +            .name = "bps_rd",
>> +            .type = QEMU_OPT_NUMBER,
>> +            .help = "limit read bytes per second",
>> +        },{
>> +            .name = "bps_wr",
>> +            .type = QEMU_OPT_NUMBER,
>> +            .help = "limit write bytes per second",
>>          },
>>          { /* end of list */ }
>>      },
>> diff --git a/qemu-option.c b/qemu-option.c
>> index 65db542..9fe234d 100644
>> --- a/qemu-option.c
>> +++ b/qemu-option.c
>> @@ -562,6 +562,23 @@ uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
>>      return opt->value.uint;
>>  }
>>
>> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts)
>> +{
>> +     int i;
>> +     uint64_t opt_val   = 0;
>> +     bool iol_flag = false;
>> +
>> +     for (i = 0; iol_opts[i]; i++) {
>> +      opt_val = qemu_opt_get_number(opts, iol_opts[i], 0);
>> +      if (opt_val != 0) {
>> +          iol_flag = true;
>> +          break;
>> +      }
>> +     }
>> +
>> +     return iol_flag;
>> +}
>> +
>>  uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval)
>>  {
>>      QemuOpt *opt = qemu_opt_find(opts, name);
>> diff --git a/qemu-option.h b/qemu-option.h
>> index b515813..fc909f9 100644
>> --- a/qemu-option.h
>> +++ b/qemu-option.h
>> @@ -107,6 +107,7 @@ struct QemuOptsList {
>>  const char *qemu_opt_get(QemuOpts *opts, const char *name);
>>  int qemu_opt_get_bool(QemuOpts *opts, const char *name, int defval);
>>  uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval);
>> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts);
>>  uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval);
>>  int qemu_opt_set(QemuOpts *opts, const char *name, const char *value);
>>  typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void *opaque);
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index cb3347e..ae219f5 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -121,6 +121,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
>>      "       [,cache=writethrough|writeback|none|unsafe][,format=f]\n"
>>      "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
>>      "       [,readonly=on|off][,boot=on|off]\n"
>> +    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
>>      "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
>>  STEXI
>>  @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
>> --
>> 1.7.2.3
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Stefan Hajnoczi - July 27, 2011, 12:58 p.m.
On Wed, Jul 27, 2011 at 11:17 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Wed, Jul 27, 2011 at 3:26 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote:
>>> Welcome to give me your comments, thanks.
>>>
>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>> ---
>>>  Makefile.objs     |    2 +-
>>>  block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>  block.h           |    1 -
>>>  block/blk-queue.c |  116 +++++++++++++++++++++
>>>  block/blk-queue.h |   70 +++++++++++++
>>>  block_int.h       |   28 +++++
>>>  blockdev.c        |   21 ++++
>>>  qemu-config.c     |   24 +++++
>>>  qemu-option.c     |   17 +++
>>>  qemu-option.h     |    1 +
>>>  qemu-options.hx   |    1 +
>>>  11 files changed, 559 insertions(+), 10 deletions(-)
>>>  create mode 100644 block/blk-queue.c
>>>  create mode 100644 block/blk-queue.h
>>>
>>> diff --git a/Makefile.objs b/Makefile.objs
>>> index 9f99ed4..06f2033 100644
>>> --- a/Makefile.objs
>>> +++ b/Makefile.objs
>>> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>>  block-nested-y += qed-check.o
>>> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
>>>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>>>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>>>  block-nested-$(CONFIG_CURL) += curl.o
>>> diff --git a/block.c b/block.c
>>> index 24a25d5..e54e59c 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -29,6 +29,9 @@
>>>  #include "module.h"
>>>  #include "qemu-objects.h"
>>>
>>> +#include "qemu-timer.h"
>>> +#include "block/blk-queue.h"
>>> +
>>>  #ifdef CONFIG_BSD
>>>  #include <sys/types.h>
>>>  #include <sys/stat.h>
>>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>>                           const uint8_t *buf, int nb_sectors);
>>>
>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>> +        double elapsed_time, uint64_t *wait);
>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>> +        bool is_write, uint64_t *wait);
>>> +
>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>
>>> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
>>>  }
>>>  #endif
>>>
>>> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
>>> +{
>>> +    if ((io_limits->bps[0] == 0)
>>> +         && (io_limits->bps[1] == 0)
>>> +         && (io_limits->bps[2] == 0)
>>> +         && (io_limits->iops[0] == 0)
>>> +         && (io_limits->iops[1] == 0)
>>> +         && (io_limits->iops[2] == 0)) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    return 1;
>>> +}
>>> +
>>>  /* check if the path starts with "<protocol>:" */
>>>  static int path_has_protocol(const char *path)
>>>  {
>>> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
>>>      }
>>>  }
>>>
>>> +static void bdrv_block_timer(void *opaque)
>>> +{
>>> +    BlockDriverState *bs = opaque;
>>> +    BlockQueue *queue = bs->block_queue;
>>> +
>>> +    while (!QTAILQ_EMPTY(&queue->requests)) {
>>> +        BlockIORequest *request;
>>> +        int ret;
>>> +
>>> +        request = QTAILQ_FIRST(&queue->requests);
>>> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>>> +
>>> +        ret = qemu_block_queue_handler(request);
>>> +        if (ret == 0) {
>>> +            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
>>> +            break;
>>> +        }
>>> +
>>> +        qemu_free(request);
>>> +    }
>>> +}
>>> +
>>>  void bdrv_register(BlockDriver *bdrv)
>>>  {
>>>      if (!bdrv->bdrv_aio_readv) {
>>> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>>      }
>>>
>>> +    /* throttling disk I/O limits */
>>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>>> +        bs->block_queue = qemu_new_block_queue();
>>> +        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>> +
>>> +        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
>>> +        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
>>> +    }
>>> +
>>
>> It should be possible to tune the limits on the flight, please introduce
>> QMP commands for that.
> Yeah, I am working on this.

It's worth mentioning that the I/O limits commands can use Supriya's
new block_set command for changing block device parameters at runtime.
 So I think the runtime limits changing can be a separate patch.

Stefan
Yoder Stuart-B08248 - July 27, 2011, 1:13 p.m.
> -----Original Message-----
> From: Anthony Liguori [mailto:aliguori@us.ibm.com]
> Sent: Tuesday, July 26, 2011 5:10 PM
> To: Yoder Stuart-B08248
> Cc: qemu-devel@nongnu.org
> Subject: Re: [PATCH 04/25] Add hard build dependency on glib
> 
> On 07/26/2011 04:51 PM, Yoder Stuart-B08248 wrote:
> >
> > I am having issues with this in a cross compilation
> > environment.   In Power embedded, almost all our
> > development is using cross toolchains.
> >
> > Neither glib or pkg-config are in our cross build environment and I'm
> > having issues getting them built and installed.
> > Not even sure if pkg-config is even supposed to work in a cross
> > development environment...I'm new to that tool and poking around a bit
> > with google raises some questions.
> 
> You're probably setting up your cross environment incorrectly which, unfortunately, is very
> common.

I got glib to build without too much trouble, however, 'make install' tries to
re-link some stuff and at that point there seems to be a bug somewhere where libtool
fails to use the correct CFLAGS and PATH, and thus the make install partially
installs glib before erroring out.

> The proper thing to do is to have GCC use a different system include directory and a different
> prefix.  That will result in a directory where there are gcc binaries with normal names
> installed in ${cross_prefix}/bin
> 
> You need to build and install pkg-config to this prefix too, and then when it comes time to
> actually doing the QEMU configure, you should do something like:
> 
> export PATH=${cross_prefix}/bin:$PATH
> export PKG_CONFIG_PATH=${cross_prefix}/lib/pkg-config:$PKG_CONFIG_PATH
> 
> Many automated cross compiler environment scripts will install specially named versions of gcc
> and binutils in your normal $PATH.  The trouble is, this is a bit of a hack and unless you
> know to make this hack work with other build tools, it all comes tumbling down.

Note-- this is not just a matter of me getting this to work in my
own private build environment, I'm working with a cross toolchain
that gets delivered to our customers that I have little control
over, and I need to get it working in that environment.

Looks like our cross tools have both a specially named version of the tool
(e.g. powerpc-linux-gnu-gcc) and plain (e.g. gcc).  Unfortunately the plain
version of the tools don't seem to be functional (and have never been used as
far as I know).

Will keep fiddling with this...

Stuart
David Gibson - July 27, 2011, 1:31 p.m.
On Wed, Jul 27, 2011 at 06:54:09PM +1000, Benjamin Herrenschmidt wrote:
> 
> > You're probably setting up your cross environment incorrectly which, 
> > unfortunately, is very common.
> > 
> > The proper thing to do is to have GCC use a different system include 
> > directory and a different prefix.  That will result in a directory where 
> > there are gcc binaries with normal names installed in ${cross_prefix}/bin
> > 
> > You need to build and install pkg-config to this prefix too, and then 
> > when it comes time to actually doing the QEMU configure, you should do 
> > something like:
> > 
> > export PATH=${cross_prefix}/bin:$PATH
> > export PKG_CONFIG_PATH=${cross_prefix}/lib/pkg-config:$PKG_CONFIG_PATH
> > 
> > Many automated cross compiler environment scripts will install specially 
> > named versions of gcc and binutils in your normal $PATH.  The trouble 
> > is, this is a bit of a hack and unless you know to make this hack work 
> > with other build tools, it all comes tumbling down.

We're not, as a rule, cross building.  We're doing compiles of ppc64
binaries on a ppc32.  Although that can be approached as a
cross-build, it's a common enough special case that it should be able
to handle this without setting a full cross-build environment.  At the
moment this does seem to work for building x86_64 binaries on a 32-bit
x86 system, but I suspect this is only accident.

> Well, that hard requirement is causing us problem on our 32/64-bit cross
> builds as well.
> 
> It looks like glib (at least recent versions in -sid) can't be built
> 64-bit on a 32-bit system :-( At least not without fixing some horrid
> bugs in there related to some generated include path from what David
> says (I'll let him comment further).

Actually, I think it can (provided a 64-bit glib is installed,
including a 64-bit version of glibconfig.h), and it's not *as* painful
to set up as I previously thought, although it's still not nice.

> In general, every time you add a library requirement without some config
> option to disable it for cases such as ours, you add pain :-)
> 
> Now, in the specific case of glib, I understand why you would want to
> get rid of re-invented wheels and use it, so I'm not specifically
> criticizing that specific change, we'll eventually have to fix it
> anyways. Just a heads up to be careful with hard requirements in
> general.
> 
> Cheers,
> Ben.
> 
>
Marcelo Tosatti - July 27, 2011, 3:49 p.m.
On Wed, Jul 27, 2011 at 06:17:15PM +0800, Zhi Yong Wu wrote:
> >> +        wait_time = 1;
> >> +    }
> >> +
> >> +    wait_time = wait_time + (slice_time - elapsed_time);
> >> +    if (wait) {
> >> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
> >> +    }
> >
> > The guest can keep submitting requests where "wait_time = 1" above,
> > and the timer will be rearmed continuously in the future.

This is wrong.

>  Can't you
> > simply arm the timer to the next slice start? _Some_ data must be
> > transfered by then, anyway (and nothing can be transfered earlier than
> > that).

This is valid.

> Sorry, i have got what you mean. Can you elaborate in more detail?

Sorry, the bug i mentioned about timer being rearmed does not exist.

But arming the timer for the last request as its done is confusing/unnecessary.

The timer processes queued requests, but the timer is armed accordingly
to the last queued request in the slice. For example, if a request is
submitted 1ms before the slice ends, the timer is armed 100ms +
(slice_time - elapsed_time) in the future.

> > Same for iops calculation below.
> >
> >> +
> >> +    return true;
> >> +}
> >> +
> >> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
> >> +                             double elapsed_time, uint64_t *wait) {
> >> +    uint64_t iops_limit = 0;
> >> +    double   ios_limit, ios_disp;
> >> +    double   slice_time = 0.1, wait_time;
> >> +
> >> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> >> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
> >> +    } else if (bs->io_limits.iops[is_write]) {
> >> +        iops_limit = bs->io_limits.iops[is_write];
> >> +    } else {
> >> +        if (wait) {
> >> +            *wait = 0;
> >> +        }
> >> +
> >> +        return false;
> >> +    }
> >> +
> >> +    ios_limit = iops_limit * slice_time;
> >> +    ios_disp  = bs->io_disps.ios[is_write];
> >> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> >> +        ios_disp += bs->io_disps.ios[!is_write];
> >> +    }
> >> +
> >> +    if (ios_disp + 1 <= ios_limit) {
> >> +     if (wait) {
> >> +         *wait = 0;
> >> +     }
> >> +
> >> +        return false;
> >> +    }
> >> +
> >> +    /* Calc approx time to dispatch */
> >> +    wait_time = (ios_disp + 1) / iops_limit;
> >> +    if (wait_time > elapsed_time) {
> >> +     wait_time = wait_time - elapsed_time;
> >> +    } else {
> >> +     wait_time = 0;
> >> +    }
> >> +
> >> +    if (wait) {
> >> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
> >> +    }
> >> +
> >> +    return true;
> >> +}
> >> +
> >> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
> >> +                           bool is_write, uint64_t *wait) {
> >> +    int64_t  real_time;
> >> +    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
> >> +    double   elapsed_time;
> >> +    int      bps_ret, iops_ret;
> >> +
> >> +    real_time = qemu_get_clock_ns(vm_clock);
> >> +    if (bs->slice_start[is_write] + BLOCK_IO_SLICE_TIME <= real_time) {
> >> +        bs->slice_start[is_write] = real_time;
> >> +
> >> +        bs->io_disps.bytes[is_write]   = 0;
> >> +        bs->io_disps.bytes[!is_write]  = 0;
> >> +
> >> +        bs->io_disps.ios[is_write]     = 0;
> >> +        bs->io_disps.ios[!is_write]    = 0;
> >> +    }
> >> +
> >> +    /* If a limit was exceeded, immediately queue this request */
> >> +    if ((bs->req_from_queue == false)
> >> +        && !QTAILQ_EMPTY(&bs->block_queue->requests)) {
> >> +        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
> >> +            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
> >> +            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> >> +            if (wait) {
> >> +                *wait = 0;
> >> +            }
> >> +
> >> +            return true;
> >> +        }
> >> +    }
> >> +
> >> +    elapsed_time  = real_time - bs->slice_start[is_write];
> >> +    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
> >> +
> >> +    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
> >> +                                      is_write, elapsed_time, &bps_wait);
> >> +    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
> >> +                                      elapsed_time, &iops_wait);
> >> +    if (bps_ret || iops_ret) {
> >> +        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
> >> +        if (wait) {
> >> +            *wait = max_wait;
> >> +        }
> >> +
> >> +        return true;
> >> +    }
> >> +
> >> +    if (wait) {
> >> +        *wait = 0;
> >> +    }
> >> +
> >> +    return false;
> >> +}
> >>
> >>  /**************************************************************/
> >>  /* async I/Os */
> >> @@ -2121,13 +2343,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
> >>  {
> >>      BlockDriver *drv = bs->drv;
> >>      BlockDriverAIOCB *ret;
> >> +    uint64_t wait_time = 0;
> >>
> >>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
> >>
> >> -    if (!drv)
> >> -        return NULL;
> >> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> >> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
> >> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
> >> +            bs->req_from_queue = false;
> >> +        }
> >>          return NULL;
> >> +    }
> >> +
> >> +    /* throttling disk read I/O */
> >> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> >> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
> >> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
> >> +                              sector_num, qiov, nb_sectors, cb, opaque);
> >> +            qemu_mod_timer(bs->block_timer,
> >> +                       wait_time + qemu_get_clock_ns(vm_clock));
> >> +            bs->req_from_queue = false;
> >> +            return ret;
> >> +        }
> >> +    }
> >>
> >>      ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
> >>                                cb, opaque);
> >> @@ -2136,6 +2373,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
> >>       /* Update stats even though technically transfer has not happened. */
> >>       bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> >>       bs->rd_ops ++;
> >> +
> >> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
> >> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
> >> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> >> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
> >> +        }
> >> +    }
> >> +
> >> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> >> +        bs->req_from_queue = false;
> >>      }
> >>
> >>      return ret;
> >> @@ -2184,15 +2431,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
> >>      BlockDriver *drv = bs->drv;
> >>      BlockDriverAIOCB *ret;
> >>      BlockCompleteData *blk_cb_data;
> >> +    uint64_t wait_time = 0;
> >>
> >>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
> >>
> >> -    if (!drv)
> >> -        return NULL;
> >> -    if (bs->read_only)
> >> -        return NULL;
> >> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
> >> +    if (!drv || bs->read_only
> >> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
> >> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
> >> +            bs->req_from_queue = false;
> >> +        }
> >> +
> >>          return NULL;
> >> +    }
> >>
> >>      if (bs->dirty_bitmap) {
> >>          blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
> >> @@ -2201,6 +2451,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
> >>          opaque = blk_cb_data;
> >>      }
> >>
> >> +    /* throttling disk write I/O */
> >> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> >> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
> >> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
> >> +                                  sector_num, qiov, nb_sectors, cb, opaque);
> >> +            qemu_mod_timer(bs->block_timer,
> >> +                                  wait_time + qemu_get_clock_ns(vm_clock));
> >> +            bs->req_from_queue = false;
> >> +            return ret;
> >> +        }
> >> +    }
> >> +
> >>      ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
> >>                                 cb, opaque);
> >>
> >> @@ -2211,6 +2473,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
> >>          if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
> >>              bs->wr_highest_sector = sector_num + nb_sectors - 1;
> >>          }
> >> +
> >> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
> >> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
> >> +                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> >> +            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
> >> +        }
> >> +    }
> >> +
> >> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
> >> +        bs->req_from_queue = false;
> >>      }
> >>
> >>      return ret;
> >> diff --git a/block.h b/block.h
> >> index 859d1d9..f0dac62 100644
> >> --- a/block.h
> >> +++ b/block.h
> >> @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
> >>      const char *backing_file, const char *backing_fmt);
> >>  void bdrv_register(BlockDriver *bdrv);
> >>
> >> -
> >>  typedef struct BdrvCheckResult {
> >>      int corruptions;
> >>      int leaks;
> >> diff --git a/block/blk-queue.c b/block/blk-queue.c
> >> new file mode 100644
> >> index 0000000..09fcfe9
> >> --- /dev/null
> >> +++ b/block/blk-queue.c
> >> @@ -0,0 +1,116 @@
> >> +/*
> >> + * QEMU System Emulator queue definition for block layer
> >> + *
> >> + * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
> >> + *
> >> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> >> + * of this software and associated documentation files (the "Software"), to deal
> >> + * in the Software without restriction, including without limitation the rights
> >> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> >> + * copies of the Software, and to permit persons to whom the Software is
> >> + * furnished to do so, subject to the following conditions:
> >> + *
> >> + * The above copyright notice and this permission notice shall be included in
> >> + * all copies or substantial portions of the Software.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> >> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >> + * THE SOFTWARE.
> >> + */
> >> +
> >> +#include "block_int.h"
> >> +#include "qemu-queue.h"
> >> +#include "block/blk-queue.h"
> >> +
> >> +/* The APIs for block request queue on qemu block layer.
> >> + */
> >> +
> >> +static void qemu_block_queue_cancel(BlockDriverAIOCB *acb)
> >> +{
> >> +    qemu_aio_release(acb);
> >> +}
> >> +
> >> +static AIOPool block_queue_pool = {
> >> +    .aiocb_size         = sizeof(struct BlockDriverAIOCB),
> >> +    .cancel             = qemu_block_queue_cancel,
> >> +};
> >> +
> >> +static void qemu_block_queue_callback(void *opaque, int ret)
> >> +{
> >> +    BlockDriverAIOCB *acb = opaque;
> >> +
> >> +    qemu_aio_release(acb);
> >> +}
> >> +
> >> +BlockQueue *qemu_new_block_queue(void)
> >> +{
> >> +    BlockQueue *queue;
> >> +
> >> +    queue = qemu_mallocz(sizeof(BlockQueue));
> >> +
> >> +    QTAILQ_INIT(&queue->requests);
> >> +
> >> +    return queue;
> >> +}
> >> +
> >> +void qemu_del_block_queue(BlockQueue *queue)
> >> +{
> >> +    BlockIORequest *request, *next;
> >> +
> >> +    QTAILQ_FOREACH_SAFE(request, &queue->requests, entry, next) {
> >> +        QTAILQ_REMOVE(&queue->requests, request, entry);
> >> +        qemu_free(request);
> >> +    }
> >> +
> >> +    qemu_free(queue);
> >> +}
> >> +
> >> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
> >> +                     BlockDriverState *bs,
> >> +                     BlockRequestHandler *handler,
> >> +                     int64_t sector_num,
> >> +                     QEMUIOVector *qiov,
> >> +                     int nb_sectors,
> >> +                     BlockDriverCompletionFunc *cb,
> >> +                     void *opaque)
> >> +{
> >> +    BlockIORequest *request;
> >> +    BlockDriverAIOCB *acb;
> >> +
> >> +    request = qemu_malloc(sizeof(BlockIORequest));
> >> +    request->bs = bs;
> >> +    request->handler = handler;
> >> +    request->sector_num = sector_num;
> >> +    request->qiov = qiov;
> >> +    request->nb_sectors = nb_sectors;
> >> +    request->cb = cb;
> >> +    request->opaque = opaque;
> >> +
> >> +    QTAILQ_INSERT_TAIL(&queue->requests, request, entry);
> >> +
> >> +    acb = qemu_aio_get(&block_queue_pool, bs,
> >> +                       qemu_block_queue_callback, opaque);
> >> +
> >> +    return acb;
> >> +}
> >> +
> >> +int qemu_block_queue_handler(BlockIORequest *request)
> >> +{
> >> +    int ret;
> >> +    BlockDriverAIOCB *res;
> >> +
> >> +    /* indicate this req is from block queue */
> >> +    request->bs->req_from_queue = true;
> >> +
> >> +    res = request->handler(request->bs, request->sector_num,
> >> +                             request->qiov, request->nb_sectors,
> >> +                             request->cb, request->opaque);
> >> +
> >> +    ret = (res == NULL) ? 0 : 1;
> >> +
> >> +    return ret;
> >> +}
> >> diff --git a/block/blk-queue.h b/block/blk-queue.h
> >> new file mode 100644
> >> index 0000000..47f8a36
> >> --- /dev/null
> >> +++ b/block/blk-queue.h
> >> @@ -0,0 +1,70 @@
> >> +/*
> >> + * QEMU System Emulator queue declaration for block layer
> >> + *
> >> + * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
> >> + *
> >> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> >> + * of this software and associated documentation files (the "Software"), to deal
> >> + * in the Software without restriction, including without limitation the rights
> >> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> >> + * copies of the Software, and to permit persons to whom the Software is
> >> + * furnished to do so, subject to the following conditions:
> >> + *
> >> + * The above copyright notice and this permission notice shall be included in
> >> + * all copies or substantial portions of the Software.
> >> + *
> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> >> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> >> + * THE SOFTWARE.
> >> + */
> >> +
> >> +#ifndef QEMU_BLOCK_QUEUE_H
> >> +#define QEMU_BLOCK_QUEUE_H
> >> +
> >> +#include "block.h"
> >> +#include "qemu-queue.h"
> >> +#include "qemu-common.h"
> >> +
> >> +typedef BlockDriverAIOCB* (BlockRequestHandler) (BlockDriverState *bs,
> >> +                                int64_t sector_num, QEMUIOVector *qiov,
> >> +                                int nb_sectors, BlockDriverCompletionFunc *cb,
> >> +                                void *opaque);
> >> +
> >> +struct BlockIORequest {
> >> +    QTAILQ_ENTRY(BlockIORequest) entry;
> >> +    BlockDriverState *bs;
> >> +    BlockRequestHandler *handler;
> >> +    int64_t sector_num;
> >> +    QEMUIOVector *qiov;
> >> +    int nb_sectors;
> >> +    BlockDriverCompletionFunc *cb;
> >> +    void *opaque;
> >> +};
> >> +
> >> +typedef struct BlockIORequest BlockIORequest;
> >> +
> >> +struct BlockQueue {
> >> +    QTAILQ_HEAD(requests, BlockIORequest) requests;
> >> +};
> >> +
> >> +typedef struct BlockQueue BlockQueue;
> >> +
> >> +BlockQueue *qemu_new_block_queue(void);
> >> +
> >> +void qemu_del_block_queue(BlockQueue *queue);
> >> +
> >> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
> >> +                        BlockDriverState *bs,
> >> +                        BlockRequestHandler *handler,
> >> +                        int64_t sector_num,
> >> +                        QEMUIOVector *qiov,
> >> +                        int nb_sectors,
> >> +                        BlockDriverCompletionFunc *cb,
> >> +                        void *opaque);
> >> +
> >> +int qemu_block_queue_handler(BlockIORequest *request);
> >> +#endif /* QEMU_BLOCK_QUEUE_H */
> >> diff --git a/block_int.h b/block_int.h
> >> index 1e265d2..1587171 100644
> >> --- a/block_int.h
> >> +++ b/block_int.h
> >> @@ -27,10 +27,17 @@
> >>  #include "block.h"
> >>  #include "qemu-option.h"
> >>  #include "qemu-queue.h"
> >> +#include "block/blk-queue.h"
> >>
> >>  #define BLOCK_FLAG_ENCRYPT   1
> >>  #define BLOCK_FLAG_COMPAT6   4
> >>
> >> +#define BLOCK_IO_LIMIT_READ     0
> >> +#define BLOCK_IO_LIMIT_WRITE    1
> >> +#define BLOCK_IO_LIMIT_TOTAL    2
> >> +
> >> +#define BLOCK_IO_SLICE_TIME     100000000
> >> +
> >>  #define BLOCK_OPT_SIZE          "size"
> >>  #define BLOCK_OPT_ENCRYPT       "encryption"
> >>  #define BLOCK_OPT_COMPAT6       "compat6"
> >> @@ -46,6 +53,16 @@ typedef struct AIOPool {
> >>      BlockDriverAIOCB *free_aiocb;
> >>  } AIOPool;
> >>
> >> +typedef struct BlockIOLimit {
> >> +    uint64_t bps[3];
> >> +    uint64_t iops[3];
> >> +} BlockIOLimit;
> >> +
> >> +typedef struct BlockIODisp {
> >> +    uint64_t bytes[2];
> >> +    uint64_t ios[2];
> >> +} BlockIODisp;
> >> +
> >>  struct BlockDriver {
> >>      const char *format_name;
> >>      int instance_size;
> >> @@ -175,6 +192,14 @@ struct BlockDriverState {
> >>
> >>      void *sync_aiocb;
> >>
> >> +    /* the time for latest disk I/O */
> >> +    int64_t slice_start[2];
> >> +    BlockIOLimit io_limits;
> >> +    BlockIODisp  io_disps;
> >> +    BlockQueue   *block_queue;
> >> +    QEMUTimer    *block_timer;
> >> +    bool         req_from_queue;
> >> +
> >>      /* I/O stats (display with "info blockstats"). */
> >>      uint64_t rd_bytes;
> >>      uint64_t wr_bytes;
> >> @@ -222,6 +247,9 @@ void qemu_aio_release(void *p);
> >>
> >>  void *qemu_blockalign(BlockDriverState *bs, size_t size);
> >>
> >> +void bdrv_set_io_limits(BlockDriverState *bs,
> >> +                            BlockIOLimit *io_limits);
> >> +
> >>  #ifdef _WIN32
> >>  int is_windows_drive(const char *filename);
> >>  #endif
> >> diff --git a/blockdev.c b/blockdev.c
> >> index c263663..45602f4 100644
> >> --- a/blockdev.c
> >> +++ b/blockdev.c
> >> @@ -238,6 +238,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
> >>      int on_read_error, on_write_error;
> >>      const char *devaddr;
> >>      DriveInfo *dinfo;
> >> +    BlockIOLimit io_limits;
> >> +    bool iol_flag = false;
> >> +    const char *iol_opts[7] = {"bps", "bps_rd", "bps_wr", "iops", "iops_rd", "iops_wr"};
> >>      int is_extboot = 0;
> >>      int snapshot = 0;
> >>      int ret;
> >> @@ -372,6 +375,19 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
> >>          return NULL;
> >>      }
> >>
> >> +    /* disk io limits */
> >> +    iol_flag = qemu_opt_io_limits_enable_flag(opts, iol_opts);
> >> +    if (iol_flag) {
> >> +        memset(&io_limits, 0, sizeof(BlockIOLimit));
> >> +
> >> +        io_limits.bps[2]  = qemu_opt_get_number(opts, "bps", 0);
> >> +        io_limits.bps[0]  = qemu_opt_get_number(opts, "bps_rd", 0);
> >> +        io_limits.bps[1]  = qemu_opt_get_number(opts, "bps_wr", 0);
> >> +        io_limits.iops[2] = qemu_opt_get_number(opts, "iops", 0);
> >> +        io_limits.iops[0] = qemu_opt_get_number(opts, "iops_rd", 0);
> >> +        io_limits.iops[1] = qemu_opt_get_number(opts, "iops_wr", 0);
> >> +    }
> >> +
> >>      on_write_error = BLOCK_ERR_STOP_ENOSPC;
> >>      if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
> >>          if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
> >> @@ -483,6 +499,11 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
> >>
> >>      bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
> >>
> >> +    /* throttling disk io limits */
> >> +    if (iol_flag) {
> >> +        bdrv_set_io_limits(dinfo->bdrv, &io_limits);
> >> +    }
> >> +
> >>      switch(type) {
> >>      case IF_IDE:
> >>      case IF_SCSI:
> >> diff --git a/qemu-config.c b/qemu-config.c
> >> index efa892c..9232bbb 100644
> >> --- a/qemu-config.c
> >> +++ b/qemu-config.c
> >> @@ -82,6 +82,30 @@ static QemuOptsList qemu_drive_opts = {
> >>              .name = "boot",
> >>              .type = QEMU_OPT_BOOL,
> >>              .help = "make this a boot drive",
> >> +        },{
> >> +            .name = "iops",
> >> +            .type = QEMU_OPT_NUMBER,
> >> +            .help = "limit total I/O operations per second",
> >> +        },{
> >> +            .name = "iops_rd",
> >> +            .type = QEMU_OPT_NUMBER,
> >> +            .help = "limit read operations per second",
> >> +        },{
> >> +            .name = "iops_wr",
> >> +            .type = QEMU_OPT_NUMBER,
> >> +            .help = "limit write operations per second",
> >> +        },{
> >> +            .name = "bps",
> >> +            .type = QEMU_OPT_NUMBER,
> >> +            .help = "limit total bytes per second",
> >> +        },{
> >> +            .name = "bps_rd",
> >> +            .type = QEMU_OPT_NUMBER,
> >> +            .help = "limit read bytes per second",
> >> +        },{
> >> +            .name = "bps_wr",
> >> +            .type = QEMU_OPT_NUMBER,
> >> +            .help = "limit write bytes per second",
> >>          },
> >>          { /* end of list */ }
> >>      },
> >> diff --git a/qemu-option.c b/qemu-option.c
> >> index 65db542..9fe234d 100644
> >> --- a/qemu-option.c
> >> +++ b/qemu-option.c
> >> @@ -562,6 +562,23 @@ uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
> >>      return opt->value.uint;
> >>  }
> >>
> >> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts)
> >> +{
> >> +     int i;
> >> +     uint64_t opt_val   = 0;
> >> +     bool iol_flag = false;
> >> +
> >> +     for (i = 0; iol_opts[i]; i++) {
> >> +      opt_val = qemu_opt_get_number(opts, iol_opts[i], 0);
> >> +      if (opt_val != 0) {
> >> +          iol_flag = true;
> >> +          break;
> >> +      }
> >> +     }
> >> +
> >> +     return iol_flag;
> >> +}
> >> +
> >>  uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval)
> >>  {
> >>      QemuOpt *opt = qemu_opt_find(opts, name);
> >> diff --git a/qemu-option.h b/qemu-option.h
> >> index b515813..fc909f9 100644
> >> --- a/qemu-option.h
> >> +++ b/qemu-option.h
> >> @@ -107,6 +107,7 @@ struct QemuOptsList {
> >>  const char *qemu_opt_get(QemuOpts *opts, const char *name);
> >>  int qemu_opt_get_bool(QemuOpts *opts, const char *name, int defval);
> >>  uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval);
> >> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts);
> >>  uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval);
> >>  int qemu_opt_set(QemuOpts *opts, const char *name, const char *value);
> >>  typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void *opaque);
> >> diff --git a/qemu-options.hx b/qemu-options.hx
> >> index cb3347e..ae219f5 100644
> >> --- a/qemu-options.hx
> >> +++ b/qemu-options.hx
> >> @@ -121,6 +121,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
> >>      "       [,cache=writethrough|writeback|none|unsafe][,format=f]\n"
> >>      "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
> >>      "       [,readonly=on|off][,boot=on|off]\n"
> >> +    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
> >>      "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
> >>  STEXI
> >>  @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
> >> --
> >> 1.7.2.3
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe kvm" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> 
> 
> 
> -- 
> Regards,
> 
> Zhi Yong Wu
Zhiyong Wu - July 28, 2011, 2:35 a.m.
On Wed, Jul 27, 2011 at 8:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Wed, Jul 27, 2011 at 11:17 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Wed, Jul 27, 2011 at 3:26 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>> On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote:
>>>> Welcome to give me your comments, thanks.
>>>>
>>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>> ---
>>>>  Makefile.objs     |    2 +-
>>>>  block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>>  block.h           |    1 -
>>>>  block/blk-queue.c |  116 +++++++++++++++++++++
>>>>  block/blk-queue.h |   70 +++++++++++++
>>>>  block_int.h       |   28 +++++
>>>>  blockdev.c        |   21 ++++
>>>>  qemu-config.c     |   24 +++++
>>>>  qemu-option.c     |   17 +++
>>>>  qemu-option.h     |    1 +
>>>>  qemu-options.hx   |    1 +
>>>>  11 files changed, 559 insertions(+), 10 deletions(-)
>>>>  create mode 100644 block/blk-queue.c
>>>>  create mode 100644 block/blk-queue.h
>>>>
>>>> diff --git a/Makefile.objs b/Makefile.objs
>>>> index 9f99ed4..06f2033 100644
>>>> --- a/Makefile.objs
>>>> +++ b/Makefile.objs
>>>> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>>>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>>>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>>>  block-nested-y += qed-check.o
>>>> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>>> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
>>>>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>>>>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>>>>  block-nested-$(CONFIG_CURL) += curl.o
>>>> diff --git a/block.c b/block.c
>>>> index 24a25d5..e54e59c 100644
>>>> --- a/block.c
>>>> +++ b/block.c
>>>> @@ -29,6 +29,9 @@
>>>>  #include "module.h"
>>>>  #include "qemu-objects.h"
>>>>
>>>> +#include "qemu-timer.h"
>>>> +#include "block/blk-queue.h"
>>>> +
>>>>  #ifdef CONFIG_BSD
>>>>  #include <sys/types.h>
>>>>  #include <sys/stat.h>
>>>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>>>                           const uint8_t *buf, int nb_sectors);
>>>>
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>> +        double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, uint64_t *wait);
>>>> +
>>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>
>>>> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
>>>>  }
>>>>  #endif
>>>>
>>>> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
>>>> +{
>>>> +    if ((io_limits->bps[0] == 0)
>>>> +         && (io_limits->bps[1] == 0)
>>>> +         && (io_limits->bps[2] == 0)
>>>> +         && (io_limits->iops[0] == 0)
>>>> +         && (io_limits->iops[1] == 0)
>>>> +         && (io_limits->iops[2] == 0)) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    return 1;
>>>> +}
>>>> +
>>>>  /* check if the path starts with "<protocol>:" */
>>>>  static int path_has_protocol(const char *path)
>>>>  {
>>>> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
>>>>      }
>>>>  }
>>>>
>>>> +static void bdrv_block_timer(void *opaque)
>>>> +{
>>>> +    BlockDriverState *bs = opaque;
>>>> +    BlockQueue *queue = bs->block_queue;
>>>> +
>>>> +    while (!QTAILQ_EMPTY(&queue->requests)) {
>>>> +        BlockIORequest *request;
>>>> +        int ret;
>>>> +
>>>> +        request = QTAILQ_FIRST(&queue->requests);
>>>> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>>>> +
>>>> +        ret = qemu_block_queue_handler(request);
>>>> +        if (ret == 0) {
>>>> +            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        qemu_free(request);
>>>> +    }
>>>> +}
>>>> +
>>>>  void bdrv_register(BlockDriver *bdrv)
>>>>  {
>>>>      if (!bdrv->bdrv_aio_readv) {
>>>> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>>>      }
>>>>
>>>> +    /* throttling disk I/O limits */
>>>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>>>> +        bs->block_queue = qemu_new_block_queue();
>>>> +        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>>> +
>>>> +        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
>>>> +        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
>>>> +    }
>>>> +
>>>
>>> It should be possible to tune the limits on the flight, please introduce
>>> QMP commands for that.
>> Yeah, I am working on this.
>
> It's worth mentioning that the I/O limits commands can use Supriya's
> new block_set command for changing block device parameters at runtime.
>  So I think the runtime limits changing can be a separate patch.
OK
>
> Stefan
>
Zhiyong Wu - July 28, 2011, 4:24 a.m.
On Wed, Jul 27, 2011 at 11:49 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Wed, Jul 27, 2011 at 06:17:15PM +0800, Zhi Yong Wu wrote:
>> >> +        wait_time = 1;
>> >> +    }
>> >> +
>> >> +    wait_time = wait_time + (slice_time - elapsed_time);
>> >> +    if (wait) {
>> >> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
>> >> +    }
>> >
>> > The guest can keep submitting requests where "wait_time = 1" above,
>> > and the timer will be rearmed continuously in the future.
>
> This is wrong.
>
>>  Can't you
>> > simply arm the timer to the next slice start? _Some_ data must be
>> > transfered by then, anyway (and nothing can be transfered earlier than
>> > that).
>
> This is valid.
>
>> Sorry, i have got what you mean. Can you elaborate in more detail?
>
> Sorry, the bug i mentioned about timer being rearmed does not exist.
>
> But arming the timer for the last request as its done is confusing/unnecessary.
>
> The timer processes queued requests, but the timer is armed accordingly
> to the last queued request in the slice. For example, if a request is
> submitted 1ms before the slice ends, the timer is armed 100ms +
> (slice_time - elapsed_time) in the future.

If the timer is simply amred to the next slice start, this timer will
be a periodic timer, either the I/O rate can not be throttled under
the limits, or the enqueued request can be delayed to handled, this
will lower I/O rate seriously than the limits.

Maybe the slice time should be variable with current I/O rate. What do
you think of it?

>
>> > Same for iops calculation below.
>> >
>> >> +
>> >> +    return true;
>> >> +}
>> >> +
>> >> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>> >> +                             double elapsed_time, uint64_t *wait) {
>> >> +    uint64_t iops_limit = 0;
>> >> +    double   ios_limit, ios_disp;
>> >> +    double   slice_time = 0.1, wait_time;
>> >> +
>> >> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> >> +        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
>> >> +    } else if (bs->io_limits.iops[is_write]) {
>> >> +        iops_limit = bs->io_limits.iops[is_write];
>> >> +    } else {
>> >> +        if (wait) {
>> >> +            *wait = 0;
>> >> +        }
>> >> +
>> >> +        return false;
>> >> +    }
>> >> +
>> >> +    ios_limit = iops_limit * slice_time;
>> >> +    ios_disp  = bs->io_disps.ios[is_write];
>> >> +    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> >> +        ios_disp += bs->io_disps.ios[!is_write];
>> >> +    }
>> >> +
>> >> +    if (ios_disp + 1 <= ios_limit) {
>> >> +     if (wait) {
>> >> +         *wait = 0;
>> >> +     }
>> >> +
>> >> +        return false;
>> >> +    }
>> >> +
>> >> +    /* Calc approx time to dispatch */
>> >> +    wait_time = (ios_disp + 1) / iops_limit;
>> >> +    if (wait_time > elapsed_time) {
>> >> +     wait_time = wait_time - elapsed_time;
>> >> +    } else {
>> >> +     wait_time = 0;
>> >> +    }
>> >> +
>> >> +    if (wait) {
>> >> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
>> >> +    }
>> >> +
>> >> +    return true;
>> >> +}
>> >> +
>> >> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>> >> +                           bool is_write, uint64_t *wait) {
>> >> +    int64_t  real_time;
>> >> +    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
>> >> +    double   elapsed_time;
>> >> +    int      bps_ret, iops_ret;
>> >> +
>> >> +    real_time = qemu_get_clock_ns(vm_clock);
>> >> +    if (bs->slice_start[is_write] + BLOCK_IO_SLICE_TIME <= real_time) {
>> >> +        bs->slice_start[is_write] = real_time;
>> >> +
>> >> +        bs->io_disps.bytes[is_write]   = 0;
>> >> +        bs->io_disps.bytes[!is_write]  = 0;
>> >> +
>> >> +        bs->io_disps.ios[is_write]     = 0;
>> >> +        bs->io_disps.ios[!is_write]    = 0;
>> >> +    }
>> >> +
>> >> +    /* If a limit was exceeded, immediately queue this request */
>> >> +    if ((bs->req_from_queue == false)
>> >> +        && !QTAILQ_EMPTY(&bs->block_queue->requests)) {
>> >> +        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
>> >> +            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
>> >> +            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
>> >> +            if (wait) {
>> >> +                *wait = 0;
>> >> +            }
>> >> +
>> >> +            return true;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    elapsed_time  = real_time - bs->slice_start[is_write];
>> >> +    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
>> >> +
>> >> +    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
>> >> +                                      is_write, elapsed_time, &bps_wait);
>> >> +    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
>> >> +                                      elapsed_time, &iops_wait);
>> >> +    if (bps_ret || iops_ret) {
>> >> +        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
>> >> +        if (wait) {
>> >> +            *wait = max_wait;
>> >> +        }
>> >> +
>> >> +        return true;
>> >> +    }
>> >> +
>> >> +    if (wait) {
>> >> +        *wait = 0;
>> >> +    }
>> >> +
>> >> +    return false;
>> >> +}
>> >>
>> >>  /**************************************************************/
>> >>  /* async I/Os */
>> >> @@ -2121,13 +2343,28 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>> >>  {
>> >>      BlockDriver *drv = bs->drv;
>> >>      BlockDriverAIOCB *ret;
>> >> +    uint64_t wait_time = 0;
>> >>
>> >>      trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
>> >>
>> >> -    if (!drv)
>> >> -        return NULL;
>> >> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> >> +    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
>> >> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
>> >> +            bs->req_from_queue = false;
>> >> +        }
>> >>          return NULL;
>> >> +    }
>> >> +
>> >> +    /* throttling disk read I/O */
>> >> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> >> +        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
>> >> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
>> >> +                              sector_num, qiov, nb_sectors, cb, opaque);
>> >> +            qemu_mod_timer(bs->block_timer,
>> >> +                       wait_time + qemu_get_clock_ns(vm_clock));
>> >> +            bs->req_from_queue = false;
>> >> +            return ret;
>> >> +        }
>> >> +    }
>> >>
>> >>      ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
>> >>                                cb, opaque);
>> >> @@ -2136,6 +2373,16 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>> >>       /* Update stats even though technically transfer has not happened. */
>> >>       bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> >>       bs->rd_ops ++;
>> >> +
>> >> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
>> >> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
>> >> +                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> >> +            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> >> +        bs->req_from_queue = false;
>> >>      }
>> >>
>> >>      return ret;
>> >> @@ -2184,15 +2431,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>> >>      BlockDriver *drv = bs->drv;
>> >>      BlockDriverAIOCB *ret;
>> >>      BlockCompleteData *blk_cb_data;
>> >> +    uint64_t wait_time = 0;
>> >>
>> >>      trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
>> >>
>> >> -    if (!drv)
>> >> -        return NULL;
>> >> -    if (bs->read_only)
>> >> -        return NULL;
>> >> -    if (bdrv_check_request(bs, sector_num, nb_sectors))
>> >> +    if (!drv || bs->read_only
>> >> +        || bdrv_check_request(bs, sector_num, nb_sectors)) {
>> >> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
>> >> +            bs->req_from_queue = false;
>> >> +        }
>> >> +
>> >>          return NULL;
>> >> +    }
>> >>
>> >>      if (bs->dirty_bitmap) {
>> >>          blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
>> >> @@ -2201,6 +2451,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>> >>          opaque = blk_cb_data;
>> >>      }
>> >>
>> >> +    /* throttling disk write I/O */
>> >> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> >> +        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
>> >> +            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
>> >> +                                  sector_num, qiov, nb_sectors, cb, opaque);
>> >> +            qemu_mod_timer(bs->block_timer,
>> >> +                                  wait_time + qemu_get_clock_ns(vm_clock));
>> >> +            bs->req_from_queue = false;
>> >> +            return ret;
>> >> +        }
>> >> +    }
>> >> +
>> >>      ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
>> >>                                 cb, opaque);
>> >>
>> >> @@ -2211,6 +2473,16 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
>> >>          if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
>> >>              bs->wr_highest_sector = sector_num + nb_sectors - 1;
>> >>          }
>> >> +
>> >> +        if (bdrv_io_limits_enable(&bs->io_limits)) {
>> >> +            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
>> >> +                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
>> >> +            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>> >> +        bs->req_from_queue = false;
>> >>      }
>> >>
>> >>      return ret;
>> >> diff --git a/block.h b/block.h
>> >> index 859d1d9..f0dac62 100644
>> >> --- a/block.h
>> >> +++ b/block.h
>> >> @@ -97,7 +97,6 @@ int bdrv_change_backing_file(BlockDriverState *bs,
>> >>      const char *backing_file, const char *backing_fmt);
>> >>  void bdrv_register(BlockDriver *bdrv);
>> >>
>> >> -
>> >>  typedef struct BdrvCheckResult {
>> >>      int corruptions;
>> >>      int leaks;
>> >> diff --git a/block/blk-queue.c b/block/blk-queue.c
>> >> new file mode 100644
>> >> index 0000000..09fcfe9
>> >> --- /dev/null
>> >> +++ b/block/blk-queue.c
>> >> @@ -0,0 +1,116 @@
>> >> +/*
>> >> + * QEMU System Emulator queue definition for block layer
>> >> + *
>> >> + * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
>> >> + *
>> >> + * Permission is hereby granted, free of charge, to any person obtaining a copy
>> >> + * of this software and associated documentation files (the "Software"), to deal
>> >> + * in the Software without restriction, including without limitation the rights
>> >> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>> >> + * copies of the Software, and to permit persons to whom the Software is
>> >> + * furnished to do so, subject to the following conditions:
>> >> + *
>> >> + * The above copyright notice and this permission notice shall be included in
>> >> + * all copies or substantial portions of the Software.
>> >> + *
>> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>> >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>> >> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>> >> + * THE SOFTWARE.
>> >> + */
>> >> +
>> >> +#include "block_int.h"
>> >> +#include "qemu-queue.h"
>> >> +#include "block/blk-queue.h"
>> >> +
>> >> +/* The APIs for block request queue on qemu block layer.
>> >> + */
>> >> +
>> >> +static void qemu_block_queue_cancel(BlockDriverAIOCB *acb)
>> >> +{
>> >> +    qemu_aio_release(acb);
>> >> +}
>> >> +
>> >> +static AIOPool block_queue_pool = {
>> >> +    .aiocb_size         = sizeof(struct BlockDriverAIOCB),
>> >> +    .cancel             = qemu_block_queue_cancel,
>> >> +};
>> >> +
>> >> +static void qemu_block_queue_callback(void *opaque, int ret)
>> >> +{
>> >> +    BlockDriverAIOCB *acb = opaque;
>> >> +
>> >> +    qemu_aio_release(acb);
>> >> +}
>> >> +
>> >> +BlockQueue *qemu_new_block_queue(void)
>> >> +{
>> >> +    BlockQueue *queue;
>> >> +
>> >> +    queue = qemu_mallocz(sizeof(BlockQueue));
>> >> +
>> >> +    QTAILQ_INIT(&queue->requests);
>> >> +
>> >> +    return queue;
>> >> +}
>> >> +
>> >> +void qemu_del_block_queue(BlockQueue *queue)
>> >> +{
>> >> +    BlockIORequest *request, *next;
>> >> +
>> >> +    QTAILQ_FOREACH_SAFE(request, &queue->requests, entry, next) {
>> >> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>> >> +        qemu_free(request);
>> >> +    }
>> >> +
>> >> +    qemu_free(queue);
>> >> +}
>> >> +
>> >> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
>> >> +                     BlockDriverState *bs,
>> >> +                     BlockRequestHandler *handler,
>> >> +                     int64_t sector_num,
>> >> +                     QEMUIOVector *qiov,
>> >> +                     int nb_sectors,
>> >> +                     BlockDriverCompletionFunc *cb,
>> >> +                     void *opaque)
>> >> +{
>> >> +    BlockIORequest *request;
>> >> +    BlockDriverAIOCB *acb;
>> >> +
>> >> +    request = qemu_malloc(sizeof(BlockIORequest));
>> >> +    request->bs = bs;
>> >> +    request->handler = handler;
>> >> +    request->sector_num = sector_num;
>> >> +    request->qiov = qiov;
>> >> +    request->nb_sectors = nb_sectors;
>> >> +    request->cb = cb;
>> >> +    request->opaque = opaque;
>> >> +
>> >> +    QTAILQ_INSERT_TAIL(&queue->requests, request, entry);
>> >> +
>> >> +    acb = qemu_aio_get(&block_queue_pool, bs,
>> >> +                       qemu_block_queue_callback, opaque);
>> >> +
>> >> +    return acb;
>> >> +}
>> >> +
>> >> +int qemu_block_queue_handler(BlockIORequest *request)
>> >> +{
>> >> +    int ret;
>> >> +    BlockDriverAIOCB *res;
>> >> +
>> >> +    /* indicate this req is from block queue */
>> >> +    request->bs->req_from_queue = true;
>> >> +
>> >> +    res = request->handler(request->bs, request->sector_num,
>> >> +                             request->qiov, request->nb_sectors,
>> >> +                             request->cb, request->opaque);
>> >> +
>> >> +    ret = (res == NULL) ? 0 : 1;
>> >> +
>> >> +    return ret;
>> >> +}
>> >> diff --git a/block/blk-queue.h b/block/blk-queue.h
>> >> new file mode 100644
>> >> index 0000000..47f8a36
>> >> --- /dev/null
>> >> +++ b/block/blk-queue.h
>> >> @@ -0,0 +1,70 @@
>> >> +/*
>> >> + * QEMU System Emulator queue declaration for block layer
>> >> + *
>> >> + * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
>> >> + *
>> >> + * Permission is hereby granted, free of charge, to any person obtaining a copy
>> >> + * of this software and associated documentation files (the "Software"), to deal
>> >> + * in the Software without restriction, including without limitation the rights
>> >> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>> >> + * copies of the Software, and to permit persons to whom the Software is
>> >> + * furnished to do so, subject to the following conditions:
>> >> + *
>> >> + * The above copyright notice and this permission notice shall be included in
>> >> + * all copies or substantial portions of the Software.
>> >> + *
>> >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>> >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>> >> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>> >> + * THE SOFTWARE.
>> >> + */
>> >> +
>> >> +#ifndef QEMU_BLOCK_QUEUE_H
>> >> +#define QEMU_BLOCK_QUEUE_H
>> >> +
>> >> +#include "block.h"
>> >> +#include "qemu-queue.h"
>> >> +#include "qemu-common.h"
>> >> +
>> >> +typedef BlockDriverAIOCB* (BlockRequestHandler) (BlockDriverState *bs,
>> >> +                                int64_t sector_num, QEMUIOVector *qiov,
>> >> +                                int nb_sectors, BlockDriverCompletionFunc *cb,
>> >> +                                void *opaque);
>> >> +
>> >> +struct BlockIORequest {
>> >> +    QTAILQ_ENTRY(BlockIORequest) entry;
>> >> +    BlockDriverState *bs;
>> >> +    BlockRequestHandler *handler;
>> >> +    int64_t sector_num;
>> >> +    QEMUIOVector *qiov;
>> >> +    int nb_sectors;
>> >> +    BlockDriverCompletionFunc *cb;
>> >> +    void *opaque;
>> >> +};
>> >> +
>> >> +typedef struct BlockIORequest BlockIORequest;
>> >> +
>> >> +struct BlockQueue {
>> >> +    QTAILQ_HEAD(requests, BlockIORequest) requests;
>> >> +};
>> >> +
>> >> +typedef struct BlockQueue BlockQueue;
>> >> +
>> >> +BlockQueue *qemu_new_block_queue(void);
>> >> +
>> >> +void qemu_del_block_queue(BlockQueue *queue);
>> >> +
>> >> +BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
>> >> +                        BlockDriverState *bs,
>> >> +                        BlockRequestHandler *handler,
>> >> +                        int64_t sector_num,
>> >> +                        QEMUIOVector *qiov,
>> >> +                        int nb_sectors,
>> >> +                        BlockDriverCompletionFunc *cb,
>> >> +                        void *opaque);
>> >> +
>> >> +int qemu_block_queue_handler(BlockIORequest *request);
>> >> +#endif /* QEMU_BLOCK_QUEUE_H */
>> >> diff --git a/block_int.h b/block_int.h
>> >> index 1e265d2..1587171 100644
>> >> --- a/block_int.h
>> >> +++ b/block_int.h
>> >> @@ -27,10 +27,17 @@
>> >>  #include "block.h"
>> >>  #include "qemu-option.h"
>> >>  #include "qemu-queue.h"
>> >> +#include "block/blk-queue.h"
>> >>
>> >>  #define BLOCK_FLAG_ENCRYPT   1
>> >>  #define BLOCK_FLAG_COMPAT6   4
>> >>
>> >> +#define BLOCK_IO_LIMIT_READ     0
>> >> +#define BLOCK_IO_LIMIT_WRITE    1
>> >> +#define BLOCK_IO_LIMIT_TOTAL    2
>> >> +
>> >> +#define BLOCK_IO_SLICE_TIME     100000000
>> >> +
>> >>  #define BLOCK_OPT_SIZE          "size"
>> >>  #define BLOCK_OPT_ENCRYPT       "encryption"
>> >>  #define BLOCK_OPT_COMPAT6       "compat6"
>> >> @@ -46,6 +53,16 @@ typedef struct AIOPool {
>> >>      BlockDriverAIOCB *free_aiocb;
>> >>  } AIOPool;
>> >>
>> >> +typedef struct BlockIOLimit {
>> >> +    uint64_t bps[3];
>> >> +    uint64_t iops[3];
>> >> +} BlockIOLimit;
>> >> +
>> >> +typedef struct BlockIODisp {
>> >> +    uint64_t bytes[2];
>> >> +    uint64_t ios[2];
>> >> +} BlockIODisp;
>> >> +
>> >>  struct BlockDriver {
>> >>      const char *format_name;
>> >>      int instance_size;
>> >> @@ -175,6 +192,14 @@ struct BlockDriverState {
>> >>
>> >>      void *sync_aiocb;
>> >>
>> >> +    /* the time for latest disk I/O */
>> >> +    int64_t slice_start[2];
>> >> +    BlockIOLimit io_limits;
>> >> +    BlockIODisp  io_disps;
>> >> +    BlockQueue   *block_queue;
>> >> +    QEMUTimer    *block_timer;
>> >> +    bool         req_from_queue;
>> >> +
>> >>      /* I/O stats (display with "info blockstats"). */
>> >>      uint64_t rd_bytes;
>> >>      uint64_t wr_bytes;
>> >> @@ -222,6 +247,9 @@ void qemu_aio_release(void *p);
>> >>
>> >>  void *qemu_blockalign(BlockDriverState *bs, size_t size);
>> >>
>> >> +void bdrv_set_io_limits(BlockDriverState *bs,
>> >> +                            BlockIOLimit *io_limits);
>> >> +
>> >>  #ifdef _WIN32
>> >>  int is_windows_drive(const char *filename);
>> >>  #endif
>> >> diff --git a/blockdev.c b/blockdev.c
>> >> index c263663..45602f4 100644
>> >> --- a/blockdev.c
>> >> +++ b/blockdev.c
>> >> @@ -238,6 +238,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>> >>      int on_read_error, on_write_error;
>> >>      const char *devaddr;
>> >>      DriveInfo *dinfo;
>> >> +    BlockIOLimit io_limits;
>> >> +    bool iol_flag = false;
>> >> +    const char *iol_opts[7] = {"bps", "bps_rd", "bps_wr", "iops", "iops_rd", "iops_wr"};
>> >>      int is_extboot = 0;
>> >>      int snapshot = 0;
>> >>      int ret;
>> >> @@ -372,6 +375,19 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>> >>          return NULL;
>> >>      }
>> >>
>> >> +    /* disk io limits */
>> >> +    iol_flag = qemu_opt_io_limits_enable_flag(opts, iol_opts);
>> >> +    if (iol_flag) {
>> >> +        memset(&io_limits, 0, sizeof(BlockIOLimit));
>> >> +
>> >> +        io_limits.bps[2]  = qemu_opt_get_number(opts, "bps", 0);
>> >> +        io_limits.bps[0]  = qemu_opt_get_number(opts, "bps_rd", 0);
>> >> +        io_limits.bps[1]  = qemu_opt_get_number(opts, "bps_wr", 0);
>> >> +        io_limits.iops[2] = qemu_opt_get_number(opts, "iops", 0);
>> >> +        io_limits.iops[0] = qemu_opt_get_number(opts, "iops_rd", 0);
>> >> +        io_limits.iops[1] = qemu_opt_get_number(opts, "iops_wr", 0);
>> >> +    }
>> >> +
>> >>      on_write_error = BLOCK_ERR_STOP_ENOSPC;
>> >>      if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
>> >>          if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
>> >> @@ -483,6 +499,11 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
>> >>
>> >>      bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
>> >>
>> >> +    /* throttling disk io limits */
>> >> +    if (iol_flag) {
>> >> +        bdrv_set_io_limits(dinfo->bdrv, &io_limits);
>> >> +    }
>> >> +
>> >>      switch(type) {
>> >>      case IF_IDE:
>> >>      case IF_SCSI:
>> >> diff --git a/qemu-config.c b/qemu-config.c
>> >> index efa892c..9232bbb 100644
>> >> --- a/qemu-config.c
>> >> +++ b/qemu-config.c
>> >> @@ -82,6 +82,30 @@ static QemuOptsList qemu_drive_opts = {
>> >>              .name = "boot",
>> >>              .type = QEMU_OPT_BOOL,
>> >>              .help = "make this a boot drive",
>> >> +        },{
>> >> +            .name = "iops",
>> >> +            .type = QEMU_OPT_NUMBER,
>> >> +            .help = "limit total I/O operations per second",
>> >> +        },{
>> >> +            .name = "iops_rd",
>> >> +            .type = QEMU_OPT_NUMBER,
>> >> +            .help = "limit read operations per second",
>> >> +        },{
>> >> +            .name = "iops_wr",
>> >> +            .type = QEMU_OPT_NUMBER,
>> >> +            .help = "limit write operations per second",
>> >> +        },{
>> >> +            .name = "bps",
>> >> +            .type = QEMU_OPT_NUMBER,
>> >> +            .help = "limit total bytes per second",
>> >> +        },{
>> >> +            .name = "bps_rd",
>> >> +            .type = QEMU_OPT_NUMBER,
>> >> +            .help = "limit read bytes per second",
>> >> +        },{
>> >> +            .name = "bps_wr",
>> >> +            .type = QEMU_OPT_NUMBER,
>> >> +            .help = "limit write bytes per second",
>> >>          },
>> >>          { /* end of list */ }
>> >>      },
>> >> diff --git a/qemu-option.c b/qemu-option.c
>> >> index 65db542..9fe234d 100644
>> >> --- a/qemu-option.c
>> >> +++ b/qemu-option.c
>> >> @@ -562,6 +562,23 @@ uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
>> >>      return opt->value.uint;
>> >>  }
>> >>
>> >> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts)
>> >> +{
>> >> +     int i;
>> >> +     uint64_t opt_val   = 0;
>> >> +     bool iol_flag = false;
>> >> +
>> >> +     for (i = 0; iol_opts[i]; i++) {
>> >> +      opt_val = qemu_opt_get_number(opts, iol_opts[i], 0);
>> >> +      if (opt_val != 0) {
>> >> +          iol_flag = true;
>> >> +          break;
>> >> +      }
>> >> +     }
>> >> +
>> >> +     return iol_flag;
>> >> +}
>> >> +
>> >>  uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval)
>> >>  {
>> >>      QemuOpt *opt = qemu_opt_find(opts, name);
>> >> diff --git a/qemu-option.h b/qemu-option.h
>> >> index b515813..fc909f9 100644
>> >> --- a/qemu-option.h
>> >> +++ b/qemu-option.h
>> >> @@ -107,6 +107,7 @@ struct QemuOptsList {
>> >>  const char *qemu_opt_get(QemuOpts *opts, const char *name);
>> >>  int qemu_opt_get_bool(QemuOpts *opts, const char *name, int defval);
>> >>  uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval);
>> >> +bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts);
>> >>  uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval);
>> >>  int qemu_opt_set(QemuOpts *opts, const char *name, const char *value);
>> >>  typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void *opaque);
>> >> diff --git a/qemu-options.hx b/qemu-options.hx
>> >> index cb3347e..ae219f5 100644
>> >> --- a/qemu-options.hx
>> >> +++ b/qemu-options.hx
>> >> @@ -121,6 +121,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
>> >>      "       [,cache=writethrough|writeback|none|unsafe][,format=f]\n"
>> >>      "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
>> >>      "       [,readonly=on|off][,boot=on|off]\n"
>> >> +    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
>> >>      "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
>> >>  STEXI
>> >>  @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
>> >> --
>> >> 1.7.2.3
>> >>
>> >> --
>> >> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> >> the body of a message to majordomo@vger.kernel.org
>> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> >
>>
>>
>>
>> --
>> Regards,
>>
>> Zhi Yong Wu
>
Zhiyong Wu - July 28, 2011, 5:43 a.m.
On Wed, Jul 27, 2011 at 8:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Wed, Jul 27, 2011 at 11:17 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Wed, Jul 27, 2011 at 3:26 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>> On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote:
>>>> Welcome to give me your comments, thanks.
>>>>
>>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>> ---
>>>>  Makefile.objs     |    2 +-
>>>>  block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>>  block.h           |    1 -
>>>>  block/blk-queue.c |  116 +++++++++++++++++++++
>>>>  block/blk-queue.h |   70 +++++++++++++
>>>>  block_int.h       |   28 +++++
>>>>  blockdev.c        |   21 ++++
>>>>  qemu-config.c     |   24 +++++
>>>>  qemu-option.c     |   17 +++
>>>>  qemu-option.h     |    1 +
>>>>  qemu-options.hx   |    1 +
>>>>  11 files changed, 559 insertions(+), 10 deletions(-)
>>>>  create mode 100644 block/blk-queue.c
>>>>  create mode 100644 block/blk-queue.h
>>>>
>>>> diff --git a/Makefile.objs b/Makefile.objs
>>>> index 9f99ed4..06f2033 100644
>>>> --- a/Makefile.objs
>>>> +++ b/Makefile.objs
>>>> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>>>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>>>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>>>  block-nested-y += qed-check.o
>>>> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>>> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
>>>>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>>>>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>>>>  block-nested-$(CONFIG_CURL) += curl.o
>>>> diff --git a/block.c b/block.c
>>>> index 24a25d5..e54e59c 100644
>>>> --- a/block.c
>>>> +++ b/block.c
>>>> @@ -29,6 +29,9 @@
>>>>  #include "module.h"
>>>>  #include "qemu-objects.h"
>>>>
>>>> +#include "qemu-timer.h"
>>>> +#include "block/blk-queue.h"
>>>> +
>>>>  #ifdef CONFIG_BSD
>>>>  #include <sys/types.h>
>>>>  #include <sys/stat.h>
>>>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>>>                           const uint8_t *buf, int nb_sectors);
>>>>
>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>> +        double elapsed_time, uint64_t *wait);
>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>> +        bool is_write, uint64_t *wait);
>>>> +
>>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>
>>>> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
>>>>  }
>>>>  #endif
>>>>
>>>> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
>>>> +{
>>>> +    if ((io_limits->bps[0] == 0)
>>>> +         && (io_limits->bps[1] == 0)
>>>> +         && (io_limits->bps[2] == 0)
>>>> +         && (io_limits->iops[0] == 0)
>>>> +         && (io_limits->iops[1] == 0)
>>>> +         && (io_limits->iops[2] == 0)) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    return 1;
>>>> +}
>>>> +
>>>>  /* check if the path starts with "<protocol>:" */
>>>>  static int path_has_protocol(const char *path)
>>>>  {
>>>> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
>>>>      }
>>>>  }
>>>>
>>>> +static void bdrv_block_timer(void *opaque)
>>>> +{
>>>> +    BlockDriverState *bs = opaque;
>>>> +    BlockQueue *queue = bs->block_queue;
>>>> +
>>>> +    while (!QTAILQ_EMPTY(&queue->requests)) {
>>>> +        BlockIORequest *request;
>>>> +        int ret;
>>>> +
>>>> +        request = QTAILQ_FIRST(&queue->requests);
>>>> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>>>> +
>>>> +        ret = qemu_block_queue_handler(request);
>>>> +        if (ret == 0) {
>>>> +            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        qemu_free(request);
>>>> +    }
>>>> +}
>>>> +
>>>>  void bdrv_register(BlockDriver *bdrv)
>>>>  {
>>>>      if (!bdrv->bdrv_aio_readv) {
>>>> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>>>      }
>>>>
>>>> +    /* throttling disk I/O limits */
>>>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>>>> +        bs->block_queue = qemu_new_block_queue();
>>>> +        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>>> +
>>>> +        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
>>>> +        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
>>>> +    }
>>>> +
>>>
>>> It should be possible to tune the limits on the flight, please introduce
>>> QMP commands for that.
>> Yeah, I am working on this.
>
> It's worth mentioning that the I/O limits commands can use Supriya's
> new block_set command for changing block device parameters at runtime.
>  So I think the runtime limits changing can be a separate patch.
Since I/O limits commands will depend on Supriya's block_set patch,
when will her patch be merged into qemu.git?

>
> Stefan
>
Stefan Hajnoczi - July 28, 2011, 8:20 a.m.
On Thu, Jul 28, 2011 at 6:43 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Wed, Jul 27, 2011 at 8:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Wed, Jul 27, 2011 at 11:17 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>>> On Wed, Jul 27, 2011 at 3:26 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>> On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote:
>>>>> Welcome to give me your comments, thanks.
>>>>>
>>>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>>> ---
>>>>>  Makefile.objs     |    2 +-
>>>>>  block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>>>  block.h           |    1 -
>>>>>  block/blk-queue.c |  116 +++++++++++++++++++++
>>>>>  block/blk-queue.h |   70 +++++++++++++
>>>>>  block_int.h       |   28 +++++
>>>>>  blockdev.c        |   21 ++++
>>>>>  qemu-config.c     |   24 +++++
>>>>>  qemu-option.c     |   17 +++
>>>>>  qemu-option.h     |    1 +
>>>>>  qemu-options.hx   |    1 +
>>>>>  11 files changed, 559 insertions(+), 10 deletions(-)
>>>>>  create mode 100644 block/blk-queue.c
>>>>>  create mode 100644 block/blk-queue.h
>>>>>
>>>>> diff --git a/Makefile.objs b/Makefile.objs
>>>>> index 9f99ed4..06f2033 100644
>>>>> --- a/Makefile.objs
>>>>> +++ b/Makefile.objs
>>>>> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>>>>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>>>>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>>>>  block-nested-y += qed-check.o
>>>>> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>>>> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
>>>>>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>>>>>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>>>>>  block-nested-$(CONFIG_CURL) += curl.o
>>>>> diff --git a/block.c b/block.c
>>>>> index 24a25d5..e54e59c 100644
>>>>> --- a/block.c
>>>>> +++ b/block.c
>>>>> @@ -29,6 +29,9 @@
>>>>>  #include "module.h"
>>>>>  #include "qemu-objects.h"
>>>>>
>>>>> +#include "qemu-timer.h"
>>>>> +#include "block/blk-queue.h"
>>>>> +
>>>>>  #ifdef CONFIG_BSD
>>>>>  #include <sys/types.h>
>>>>>  #include <sys/stat.h>
>>>>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>>>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>>>>                           const uint8_t *buf, int nb_sectors);
>>>>>
>>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>>> +        double elapsed_time, uint64_t *wait);
>>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>>> +        bool is_write, uint64_t *wait);
>>>>> +
>>>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>>
>>>>> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
>>>>>  }
>>>>>  #endif
>>>>>
>>>>> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
>>>>> +{
>>>>> +    if ((io_limits->bps[0] == 0)
>>>>> +         && (io_limits->bps[1] == 0)
>>>>> +         && (io_limits->bps[2] == 0)
>>>>> +         && (io_limits->iops[0] == 0)
>>>>> +         && (io_limits->iops[1] == 0)
>>>>> +         && (io_limits->iops[2] == 0)) {
>>>>> +        return 0;
>>>>> +    }
>>>>> +
>>>>> +    return 1;
>>>>> +}
>>>>> +
>>>>>  /* check if the path starts with "<protocol>:" */
>>>>>  static int path_has_protocol(const char *path)
>>>>>  {
>>>>> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
>>>>>      }
>>>>>  }
>>>>>
>>>>> +static void bdrv_block_timer(void *opaque)
>>>>> +{
>>>>> +    BlockDriverState *bs = opaque;
>>>>> +    BlockQueue *queue = bs->block_queue;
>>>>> +
>>>>> +    while (!QTAILQ_EMPTY(&queue->requests)) {
>>>>> +        BlockIORequest *request;
>>>>> +        int ret;
>>>>> +
>>>>> +        request = QTAILQ_FIRST(&queue->requests);
>>>>> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>>>>> +
>>>>> +        ret = qemu_block_queue_handler(request);
>>>>> +        if (ret == 0) {
>>>>> +            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
>>>>> +            break;
>>>>> +        }
>>>>> +
>>>>> +        qemu_free(request);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>  void bdrv_register(BlockDriver *bdrv)
>>>>>  {
>>>>>      if (!bdrv->bdrv_aio_readv) {
>>>>> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>>>>      }
>>>>>
>>>>> +    /* throttling disk I/O limits */
>>>>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>>>>> +        bs->block_queue = qemu_new_block_queue();
>>>>> +        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>>>> +
>>>>> +        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
>>>>> +        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
>>>>> +    }
>>>>> +
>>>>
>>>> It should be possible to tune the limits on the flight, please introduce
>>>> QMP commands for that.
>>> Yeah, I am working on this.
>>
>> It's worth mentioning that the I/O limits commands can use Supriya's
>> new block_set command for changing block device parameters at runtime.
>>  So I think the runtime limits changing can be a separate patch.
> Since I/O limits commands will depend on Supriya's block_set patch,
> when will her patch be merged into qemu.git?

The block_set patch is on the mailing list with an ongoing discussion.
 Until agreement is reached it will not be merged - hard to tell when
exactly that will be, but hopefully over the next few days.

You can always add temporary QMP/HMP commands in order to test setting
limits at runtime.  It should be easy to switch to a different QMP/HMP
command later, if necessary.

Stefan
Stefan Hajnoczi - July 28, 2011, 8:25 a.m.
On Thu, Jul 28, 2011 at 9:20 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Thu, Jul 28, 2011 at 6:43 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Wed, Jul 27, 2011 at 8:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>> On Wed, Jul 27, 2011 at 11:17 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>>>> On Wed, Jul 27, 2011 at 3:26 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>>> On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote:
>>>>>> Welcome to give me your comments, thanks.
>>>>>>
>>>>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>>>> ---
>>>>>>  Makefile.objs     |    2 +-
>>>>>>  block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>>>>  block.h           |    1 -
>>>>>>  block/blk-queue.c |  116 +++++++++++++++++++++
>>>>>>  block/blk-queue.h |   70 +++++++++++++
>>>>>>  block_int.h       |   28 +++++
>>>>>>  blockdev.c        |   21 ++++
>>>>>>  qemu-config.c     |   24 +++++
>>>>>>  qemu-option.c     |   17 +++
>>>>>>  qemu-option.h     |    1 +
>>>>>>  qemu-options.hx   |    1 +
>>>>>>  11 files changed, 559 insertions(+), 10 deletions(-)
>>>>>>  create mode 100644 block/blk-queue.c
>>>>>>  create mode 100644 block/blk-queue.h
>>>>>>
>>>>>> diff --git a/Makefile.objs b/Makefile.objs
>>>>>> index 9f99ed4..06f2033 100644
>>>>>> --- a/Makefile.objs
>>>>>> +++ b/Makefile.objs
>>>>>> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>>>>>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>>>>>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>>>>>  block-nested-y += qed-check.o
>>>>>> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>>>>> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
>>>>>>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>>>>>>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>>>>>>  block-nested-$(CONFIG_CURL) += curl.o
>>>>>> diff --git a/block.c b/block.c
>>>>>> index 24a25d5..e54e59c 100644
>>>>>> --- a/block.c
>>>>>> +++ b/block.c
>>>>>> @@ -29,6 +29,9 @@
>>>>>>  #include "module.h"
>>>>>>  #include "qemu-objects.h"
>>>>>>
>>>>>> +#include "qemu-timer.h"
>>>>>> +#include "block/blk-queue.h"
>>>>>> +
>>>>>>  #ifdef CONFIG_BSD
>>>>>>  #include <sys/types.h>
>>>>>>  #include <sys/stat.h>
>>>>>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>>>>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>>>>>                           const uint8_t *buf, int nb_sectors);
>>>>>>
>>>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>>>> +        double elapsed_time, uint64_t *wait);
>>>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>>>> +        bool is_write, uint64_t *wait);
>>>>>> +
>>>>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>>>
>>>>>> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
>>>>>>  }
>>>>>>  #endif
>>>>>>
>>>>>> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
>>>>>> +{
>>>>>> +    if ((io_limits->bps[0] == 0)
>>>>>> +         && (io_limits->bps[1] == 0)
>>>>>> +         && (io_limits->bps[2] == 0)
>>>>>> +         && (io_limits->iops[0] == 0)
>>>>>> +         && (io_limits->iops[1] == 0)
>>>>>> +         && (io_limits->iops[2] == 0)) {
>>>>>> +        return 0;
>>>>>> +    }
>>>>>> +
>>>>>> +    return 1;
>>>>>> +}
>>>>>> +
>>>>>>  /* check if the path starts with "<protocol>:" */
>>>>>>  static int path_has_protocol(const char *path)
>>>>>>  {
>>>>>> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
>>>>>>      }
>>>>>>  }
>>>>>>
>>>>>> +static void bdrv_block_timer(void *opaque)
>>>>>> +{
>>>>>> +    BlockDriverState *bs = opaque;
>>>>>> +    BlockQueue *queue = bs->block_queue;
>>>>>> +
>>>>>> +    while (!QTAILQ_EMPTY(&queue->requests)) {
>>>>>> +        BlockIORequest *request;
>>>>>> +        int ret;
>>>>>> +
>>>>>> +        request = QTAILQ_FIRST(&queue->requests);
>>>>>> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>>>>>> +
>>>>>> +        ret = qemu_block_queue_handler(request);
>>>>>> +        if (ret == 0) {
>>>>>> +            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
>>>>>> +            break;
>>>>>> +        }
>>>>>> +
>>>>>> +        qemu_free(request);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>>  void bdrv_register(BlockDriver *bdrv)
>>>>>>  {
>>>>>>      if (!bdrv->bdrv_aio_readv) {
>>>>>> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>>>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>>>>>      }
>>>>>>
>>>>>> +    /* throttling disk I/O limits */
>>>>>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>>>>>> +        bs->block_queue = qemu_new_block_queue();
>>>>>> +        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>>>>> +
>>>>>> +        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
>>>>>> +        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
>>>>>> +    }
>>>>>> +
>>>>>
>>>>> It should be possible to tune the limits on the flight, please introduce
>>>>> QMP commands for that.
>>>> Yeah, I am working on this.
>>>
>>> It's worth mentioning that the I/O limits commands can use Supriya's
>>> new block_set command for changing block device parameters at runtime.
>>>  So I think the runtime limits changing can be a separate patch.
>> Since I/O limits commands will depend on Supriya's block_set patch,
>> when will her patch be merged into qemu.git?
>
> The block_set patch is on the mailing list with an ongoing discussion.
>  Until agreement is reached it will not be merged - hard to tell when
> exactly that will be, but hopefully over the next few days.
>
> You can always add temporary QMP/HMP commands in order to test setting
> limits at runtime.  It should be easy to switch to a different QMP/HMP
> command later, if necessary.

Just to clarify: adding temporary QMP/HMP commands in your personal
git repo can be useful for development and testing.  It will let you
continue fleshing out things like block device hotplug.  Of course
qemu.git can only merge commands that we intend to support for the
future, so a final QMP/HMP interface needs to be written at some point
but it does not stop you from making progress.

Stefan
Zhiyong Wu - July 28, 2011, 8:50 a.m.
On Thu, Jul 28, 2011 at 4:25 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Thu, Jul 28, 2011 at 9:20 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> On Thu, Jul 28, 2011 at 6:43 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>>> On Wed, Jul 27, 2011 at 8:58 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>>>> On Wed, Jul 27, 2011 at 11:17 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>>>>> On Wed, Jul 27, 2011 at 3:26 AM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>>>>>> On Tue, Jul 26, 2011 at 04:59:06PM +0800, Zhi Yong Wu wrote:
>>>>>>> Welcome to give me your comments, thanks.
>>>>>>>
>>>>>>> Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
>>>>>>> ---
>>>>>>>  Makefile.objs     |    2 +-
>>>>>>>  block.c           |  288 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>>>>>>>  block.h           |    1 -
>>>>>>>  block/blk-queue.c |  116 +++++++++++++++++++++
>>>>>>>  block/blk-queue.h |   70 +++++++++++++
>>>>>>>  block_int.h       |   28 +++++
>>>>>>>  blockdev.c        |   21 ++++
>>>>>>>  qemu-config.c     |   24 +++++
>>>>>>>  qemu-option.c     |   17 +++
>>>>>>>  qemu-option.h     |    1 +
>>>>>>>  qemu-options.hx   |    1 +
>>>>>>>  11 files changed, 559 insertions(+), 10 deletions(-)
>>>>>>>  create mode 100644 block/blk-queue.c
>>>>>>>  create mode 100644 block/blk-queue.h
>>>>>>>
>>>>>>> diff --git a/Makefile.objs b/Makefile.objs
>>>>>>> index 9f99ed4..06f2033 100644
>>>>>>> --- a/Makefile.objs
>>>>>>> +++ b/Makefile.objs
>>>>>>> @@ -23,7 +23,7 @@ block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
>>>>>>>  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
>>>>>>>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>>>>>>>  block-nested-y += qed-check.o
>>>>>>> -block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
>>>>>>> +block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
>>>>>>>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>>>>>>>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>>>>>>>  block-nested-$(CONFIG_CURL) += curl.o
>>>>>>> diff --git a/block.c b/block.c
>>>>>>> index 24a25d5..e54e59c 100644
>>>>>>> --- a/block.c
>>>>>>> +++ b/block.c
>>>>>>> @@ -29,6 +29,9 @@
>>>>>>>  #include "module.h"
>>>>>>>  #include "qemu-objects.h"
>>>>>>>
>>>>>>> +#include "qemu-timer.h"
>>>>>>> +#include "block/blk-queue.h"
>>>>>>> +
>>>>>>>  #ifdef CONFIG_BSD
>>>>>>>  #include <sys/types.h>
>>>>>>>  #include <sys/stat.h>
>>>>>>> @@ -58,6 +61,13 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>>>>>>>  static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>>>>>>>                           const uint8_t *buf, int nb_sectors);
>>>>>>>
>>>>>>> +static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>>>>>>> +        bool is_write, double elapsed_time, uint64_t *wait);
>>>>>>> +static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>>>>>>> +        double elapsed_time, uint64_t *wait);
>>>>>>> +static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>>>>> +        bool is_write, uint64_t *wait);
>>>>>>> +
>>>>>>>  static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>>>>>>>      QTAILQ_HEAD_INITIALIZER(bdrv_states);
>>>>>>>
>>>>>>> @@ -90,6 +100,20 @@ int is_windows_drive(const char *filename)
>>>>>>>  }
>>>>>>>  #endif
>>>>>>>
>>>>>>> +static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
>>>>>>> +{
>>>>>>> +    if ((io_limits->bps[0] == 0)
>>>>>>> +         && (io_limits->bps[1] == 0)
>>>>>>> +         && (io_limits->bps[2] == 0)
>>>>>>> +         && (io_limits->iops[0] == 0)
>>>>>>> +         && (io_limits->iops[1] == 0)
>>>>>>> +         && (io_limits->iops[2] == 0)) {
>>>>>>> +        return 0;
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    return 1;
>>>>>>> +}
>>>>>>> +
>>>>>>>  /* check if the path starts with "<protocol>:" */
>>>>>>>  static int path_has_protocol(const char *path)
>>>>>>>  {
>>>>>>> @@ -167,6 +191,28 @@ void path_combine(char *dest, int dest_size,
>>>>>>>      }
>>>>>>>  }
>>>>>>>
>>>>>>> +static void bdrv_block_timer(void *opaque)
>>>>>>> +{
>>>>>>> +    BlockDriverState *bs = opaque;
>>>>>>> +    BlockQueue *queue = bs->block_queue;
>>>>>>> +
>>>>>>> +    while (!QTAILQ_EMPTY(&queue->requests)) {
>>>>>>> +        BlockIORequest *request;
>>>>>>> +        int ret;
>>>>>>> +
>>>>>>> +        request = QTAILQ_FIRST(&queue->requests);
>>>>>>> +        QTAILQ_REMOVE(&queue->requests, request, entry);
>>>>>>> +
>>>>>>> +        ret = qemu_block_queue_handler(request);
>>>>>>> +        if (ret == 0) {
>>>>>>> +            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
>>>>>>> +            break;
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        qemu_free(request);
>>>>>>> +    }
>>>>>>> +}
>>>>>>> +
>>>>>>>  void bdrv_register(BlockDriver *bdrv)
>>>>>>>  {
>>>>>>>      if (!bdrv->bdrv_aio_readv) {
>>>>>>> @@ -642,6 +688,15 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
>>>>>>>              bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
>>>>>>>      }
>>>>>>>
>>>>>>> +    /* throttling disk I/O limits */
>>>>>>> +    if (bdrv_io_limits_enable(&bs->io_limits)) {
>>>>>>> +        bs->block_queue = qemu_new_block_queue();
>>>>>>> +        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
>>>>>>> +
>>>>>>> +        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
>>>>>>> +        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
>>>>>>> +    }
>>>>>>> +
>>>>>>
>>>>>> It should be possible to tune the limits on the flight, please introduce
>>>>>> QMP commands for that.
>>>>> Yeah, I am working on this.
>>>>
>>>> It's worth mentioning that the I/O limits commands can use Supriya's
>>>> new block_set command for changing block device parameters at runtime.
>>>>  So I think the runtime limits changing can be a separate patch.
>>> Since I/O limits commands will depend on Supriya's block_set patch,
>>> when will her patch be merged into qemu.git?
>>
>> The block_set patch is on the mailing list with an ongoing discussion.
>>  Until agreement is reached it will not be merged - hard to tell when
>> exactly that will be, but hopefully over the next few days.
>>
>> You can always add temporary QMP/HMP commands in order to test setting
>> limits at runtime.  It should be easy to switch to a different QMP/HMP
>> command later, if necessary.
>
> Just to clarify: adding temporary QMP/HMP commands in your personal
> git repo can be useful for development and testing.  It will let you
> continue fleshing out things like block device hotplug.  Of course
> qemu.git can only merge commands that we intend to support for the
> future, so a final QMP/HMP interface needs to be written at some point
> but it does not stop you from making progress.
No matter. no pain, no gain. by adding temporary QMP/HMP commands, i
can learn about block device hotplug. I think that it is worth doing
it.

>
> Stefan
>
Marcelo Tosatti - July 28, 2011, 2:42 p.m.
On Thu, Jul 28, 2011 at 12:24:48PM +0800, Zhi Yong Wu wrote:
> On Wed, Jul 27, 2011 at 11:49 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Wed, Jul 27, 2011 at 06:17:15PM +0800, Zhi Yong Wu wrote:
> >> >> +        wait_time = 1;
> >> >> +    }
> >> >> +
> >> >> +    wait_time = wait_time + (slice_time - elapsed_time);
> >> >> +    if (wait) {
> >> >> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
> >> >> +    }
> >> >
> >> > The guest can keep submitting requests where "wait_time = 1" above,
> >> > and the timer will be rearmed continuously in the future.
> >
> > This is wrong.
> >
> >>  Can't you
> >> > simply arm the timer to the next slice start? _Some_ data must be
> >> > transfered by then, anyway (and nothing can be transfered earlier than
> >> > that).
> >
> > This is valid.
> >
> >> Sorry, i have got what you mean. Can you elaborate in more detail?
> >
> > Sorry, the bug i mentioned about timer being rearmed does not exist.
> >
> > But arming the timer for the last request as its done is confusing/unnecessary.
> >
> > The timer processes queued requests, but the timer is armed accordingly
> > to the last queued request in the slice. For example, if a request is
> > submitted 1ms before the slice ends, the timer is armed 100ms +
> > (slice_time - elapsed_time) in the future.
> 
> If the timer is simply amred to the next slice start, this timer will
> be a periodic timer, either the I/O rate can not be throttled under
> the limits, or the enqueued request can be delayed to handled, this
> will lower I/O rate seriously than the limits.

Yes, periodic but disarmed when there is no queueing. I don't understand
your point about low I/O rate.

> Maybe the slice time should be variable with current I/O rate. What do
> you think of it?

Not sure if its necessary. The slice should be small to avoid excessive
work on timer context, but the advantage of increasing the slice is
very small if any. BTW, 10ms seems a better default than 100ms.
Zhiyong Wu - July 29, 2011, 2:09 a.m.
On Thu, Jul 28, 2011 at 10:42 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Thu, Jul 28, 2011 at 12:24:48PM +0800, Zhi Yong Wu wrote:
>> On Wed, Jul 27, 2011 at 11:49 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Wed, Jul 27, 2011 at 06:17:15PM +0800, Zhi Yong Wu wrote:
>> >> >> +        wait_time = 1;
>> >> >> +    }
>> >> >> +
>> >> >> +    wait_time = wait_time + (slice_time - elapsed_time);
>> >> >> +    if (wait) {
>> >> >> +        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
>> >> >> +    }
>> >> >
>> >> > The guest can keep submitting requests where "wait_time = 1" above,
>> >> > and the timer will be rearmed continuously in the future.
>> >
>> > This is wrong.
>> >
>> >>  Can't you
>> >> > simply arm the timer to the next slice start? _Some_ data must be
>> >> > transfered by then, anyway (and nothing can be transfered earlier than
>> >> > that).
>> >
>> > This is valid.
>> >
>> >> Sorry, i have got what you mean. Can you elaborate in more detail?
>> >
>> > Sorry, the bug i mentioned about timer being rearmed does not exist.
>> >
>> > But arming the timer for the last request as its done is confusing/unnecessary.
>> >
>> > The timer processes queued requests, but the timer is armed accordingly
>> > to the last queued request in the slice. For example, if a request is
>> > submitted 1ms before the slice ends, the timer is armed 100ms +
>> > (slice_time - elapsed_time) in the future.
>>
>> If the timer is simply amred to the next slice start, this timer will
>> be a periodic timer, either the I/O rate can not be throttled under
>> the limits, or the enqueued request can be delayed to handled, this
>> will lower I/O rate seriously than the limits.
>
> Yes, periodic but disarmed when there is no queueing. I don't understand
> your point about low I/O rate.
We hope that at the same time the runtime I/O rate is throttled under
this limits, it can be very close to the limits. If the timer is
simpily armed to the next slice, the enqueued request could be delayed
a bit long to be handled, this could make current I/O rate lowerer
largely than the limits. So this could seriously hurt the I/O
performance.

>
>> Maybe the slice time should be variable with current I/O rate. What do
>> you think of it?
>
> Not sure if its necessary. The slice should be small to avoid excessive
> work on timer context, but the advantage of increasing the slice is
> very small if any. BTW, 10ms seems a better default than 100ms.
Thanks for your comments, if you're interested, pls take a look at the
code V3. It has added the codes for variable slice time.

>
>

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 9f99ed4..06f2033 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -23,7 +23,7 @@  block-nested-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vv
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
-block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o blk-queue.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block.c b/block.c
index 24a25d5..e54e59c 100644
--- a/block.c
+++ b/block.c
@@ -29,6 +29,9 @@ 
 #include "module.h"
 #include "qemu-objects.h"
 
+#include "qemu-timer.h"
+#include "block/blk-queue.h"
+
 #ifdef CONFIG_BSD
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -58,6 +61,13 @@  static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+        double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, uint64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -90,6 +100,20 @@  int is_windows_drive(const char *filename)
 }
 #endif
 
+static int bdrv_io_limits_enable(BlockIOLimit *io_limits)
+{
+    if ((io_limits->bps[0] == 0)
+         && (io_limits->bps[1] == 0)
+         && (io_limits->bps[2] == 0)
+         && (io_limits->iops[0] == 0)
+         && (io_limits->iops[1] == 0)
+         && (io_limits->iops[2] == 0)) {
+        return 0;
+    }
+
+    return 1;
+}
+
 /* check if the path starts with "<protocol>:" */
 static int path_has_protocol(const char *path)
 {
@@ -167,6 +191,28 @@  void path_combine(char *dest, int dest_size,
     }
 }
 
+static void bdrv_block_timer(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    BlockQueue *queue = bs->block_queue;
+
+    while (!QTAILQ_EMPTY(&queue->requests)) {
+        BlockIORequest *request;
+        int ret;
+
+        request = QTAILQ_FIRST(&queue->requests);
+        QTAILQ_REMOVE(&queue->requests, request, entry);
+
+        ret = qemu_block_queue_handler(request);
+        if (ret == 0) {
+            QTAILQ_INSERT_HEAD(&queue->requests, request, entry);
+            break;
+        }
+
+        qemu_free(request);
+    }
+}
+
 void bdrv_register(BlockDriver *bdrv)
 {
     if (!bdrv->bdrv_aio_readv) {
@@ -642,6 +688,15 @@  int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
 
+    /* throttling disk I/O limits */
+    if (bdrv_io_limits_enable(&bs->io_limits)) {
+        bs->block_queue = qemu_new_block_queue();
+        bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+
+        bs->slice_start[0] = qemu_get_clock_ns(vm_clock);
+        bs->slice_start[1] = qemu_get_clock_ns(vm_clock);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -680,6 +735,16 @@  void bdrv_close(BlockDriverState *bs)
         if (bs->change_cb)
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
+
+    /* throttling disk I/O limits */
+    if (bs->block_queue) {
+        qemu_del_block_queue(bs->block_queue);
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+    }
 }
 
 void bdrv_close_all(void)
@@ -1312,6 +1377,14 @@  void bdrv_get_geometry_hint(BlockDriverState *bs,
     *psecs = bs->secs;
 }
 
+/* throttling disk io limits */
+void bdrv_set_io_limits(BlockDriverState *bs,
+			    BlockIOLimit *io_limits)
+{
+    memset(&bs->io_limits, 0, sizeof(BlockIOLimit));
+    bs->io_limits = *io_limits;
+}
+
 /* Recognize floppy formats */
 typedef struct FDFormat {
     FDriveType drive;
@@ -2111,6 +2184,155 @@  char *bdrv_snapshot_dump(char *buf, int buf_size, QEMUSnapshotInfo *sn)
     return buf;
 }
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+                 bool is_write, double elapsed_time, uint64_t *wait) {
+    uint64_t bps_limit = 0;
+    double   bytes_limit, bytes_disp, bytes_res;
+    double   slice_time = 0.1, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    bytes_limit	= bps_limit * slice_time;
+    bytes_disp  = bs->io_disps.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_disp += bs->io_disps.bytes[!is_write];
+    }
+
+    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_disp + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_disp + bytes_res - bytes_limit) / bps_limit;
+    if (!wait_time) {
+        wait_time = 1;
+    }
+
+    wait_time = wait_time + (slice_time - elapsed_time);
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+                             double elapsed_time, uint64_t *wait) {
+    uint64_t iops_limit = 0;
+    double   ios_limit, ios_disp;
+    double   slice_time = 0.1, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    ios_limit = iops_limit * slice_time;
+    ios_disp  = bs->io_disps.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_disp += bs->io_disps.ios[!is_write];
+    }
+
+    if (ios_disp + 1 <= ios_limit) {
+	if (wait) {
+	    *wait = 0;
+	}
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_disp + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+	wait_time = wait_time - elapsed_time;
+    } else {
+	wait_time = 0;
+    }
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10 + 1;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+                           bool is_write, uint64_t *wait) {
+    int64_t  real_time;
+    uint64_t bps_wait = 0, iops_wait = 0, max_wait;
+    double   elapsed_time;
+    int      bps_ret, iops_ret;
+
+    real_time = qemu_get_clock_ns(vm_clock);
+    if (bs->slice_start[is_write] + BLOCK_IO_SLICE_TIME <= real_time) {
+        bs->slice_start[is_write] = real_time;
+
+        bs->io_disps.bytes[is_write]   = 0;
+        bs->io_disps.bytes[!is_write]  = 0;
+
+        bs->io_disps.ios[is_write]     = 0;
+        bs->io_disps.ios[!is_write]    = 0;
+    }
+
+    /* If a limit was exceeded, immediately queue this request */
+    if ((bs->req_from_queue == false)
+        && !QTAILQ_EMPTY(&bs->block_queue->requests)) {
+        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
+            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+            if (wait) {
+                *wait = 0;
+            }
+
+            return true;
+        }
+    }
+
+    elapsed_time  = real_time - bs->slice_start[is_write];
+    elapsed_time  /= (BLOCK_IO_SLICE_TIME * 10.0);
+
+    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                      is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                      elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
 
 /**************************************************************/
 /* async I/Os */
@@ -2121,13 +2343,28 @@  BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
 {
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
+    uint64_t wait_time = 0;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
+        if (bdrv_io_limits_enable(&bs->io_limits)) {
+            bs->req_from_queue = false;
+        }
         return NULL;
+    }
+
+    /* throttling disk read I/O */
+    if (bdrv_io_limits_enable(&bs->io_limits)) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
+                              sector_num, qiov, nb_sectors, cb, opaque);
+            qemu_mod_timer(bs->block_timer,
+                       wait_time + qemu_get_clock_ns(vm_clock));
+            bs->req_from_queue = false;
+            return ret;
+        }
+    }
 
     ret = drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                               cb, opaque);
@@ -2136,6 +2373,16 @@  BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
 	/* Update stats even though technically transfer has not happened. */
 	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
 	bs->rd_ops ++;
+
+        if (bdrv_io_limits_enable(&bs->io_limits)) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
+                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
+        }
+    }
+
+    if (bdrv_io_limits_enable(&bs->io_limits)) {
+        bs->req_from_queue = false;
     }
 
     return ret;
@@ -2184,15 +2431,18 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
     BlockCompleteData *blk_cb_data;
+    uint64_t wait_time = 0;
 
     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bs->read_only)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bs->read_only
+        || bdrv_check_request(bs, sector_num, nb_sectors)) {
+        if (bdrv_io_limits_enable(&bs->io_limits)) {
+            bs->req_from_queue = false;
+        }
+
         return NULL;
+    }
 
     if (bs->dirty_bitmap) {
         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2201,6 +2451,18 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
         opaque = blk_cb_data;
     }
 
+    /* throttling disk write I/O */
+    if (bdrv_io_limits_enable(&bs->io_limits)) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
+                                  sector_num, qiov, nb_sectors, cb, opaque);
+            qemu_mod_timer(bs->block_timer,
+                                  wait_time + qemu_get_clock_ns(vm_clock));
+            bs->req_from_queue = false;
+            return ret;
+        }
+    }
+
     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
 
@@ -2211,6 +2473,16 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
+
+        if (bdrv_io_limits_enable(&bs->io_limits)) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
+                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
+        }
+    }
+
+    if (bdrv_io_limits_enable(&bs->io_limits)) {
+        bs->req_from_queue = false;
     }
 
     return ret;
diff --git a/block.h b/block.h
index 859d1d9..f0dac62 100644
--- a/block.h
+++ b/block.h
@@ -97,7 +97,6 @@  int bdrv_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt);
 void bdrv_register(BlockDriver *bdrv);
 
-
 typedef struct BdrvCheckResult {
     int corruptions;
     int leaks;
diff --git a/block/blk-queue.c b/block/blk-queue.c
new file mode 100644
index 0000000..09fcfe9
--- /dev/null
+++ b/block/blk-queue.c
@@ -0,0 +1,116 @@ 
+/*
+ * QEMU System Emulator queue definition for block layer
+ *
+ * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "block_int.h"
+#include "qemu-queue.h"
+#include "block/blk-queue.h"
+
+/* The APIs for block request queue on qemu block layer.
+ */
+
+static void qemu_block_queue_cancel(BlockDriverAIOCB *acb)
+{
+    qemu_aio_release(acb);
+}
+
+static AIOPool block_queue_pool = {
+    .aiocb_size         = sizeof(struct BlockDriverAIOCB),
+    .cancel             = qemu_block_queue_cancel,
+};
+
+static void qemu_block_queue_callback(void *opaque, int ret)
+{
+    BlockDriverAIOCB *acb = opaque;
+
+    qemu_aio_release(acb);
+}
+
+BlockQueue *qemu_new_block_queue(void)
+{
+    BlockQueue *queue;
+
+    queue = qemu_mallocz(sizeof(BlockQueue));
+
+    QTAILQ_INIT(&queue->requests);
+
+    return queue;
+}
+
+void qemu_del_block_queue(BlockQueue *queue)
+{
+    BlockIORequest *request, *next;
+
+    QTAILQ_FOREACH_SAFE(request, &queue->requests, entry, next) {
+        QTAILQ_REMOVE(&queue->requests, request, entry);
+        qemu_free(request);
+    }
+
+    qemu_free(queue);
+}
+
+BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
+			BlockDriverState *bs,
+			BlockRequestHandler *handler,
+			int64_t sector_num,
+			QEMUIOVector *qiov,
+			int nb_sectors,
+			BlockDriverCompletionFunc *cb,
+			void *opaque)
+{
+    BlockIORequest *request;
+    BlockDriverAIOCB *acb;
+
+    request = qemu_malloc(sizeof(BlockIORequest));
+    request->bs = bs;
+    request->handler = handler;
+    request->sector_num = sector_num;
+    request->qiov = qiov;
+    request->nb_sectors = nb_sectors;
+    request->cb = cb;
+    request->opaque = opaque;
+
+    QTAILQ_INSERT_TAIL(&queue->requests, request, entry);
+
+    acb = qemu_aio_get(&block_queue_pool, bs,
+                       qemu_block_queue_callback, opaque);
+
+    return acb;
+}
+
+int qemu_block_queue_handler(BlockIORequest *request)
+{
+    int ret;
+    BlockDriverAIOCB *res;
+
+    /* indicate this req is from block queue */
+    request->bs->req_from_queue = true;
+
+    res = request->handler(request->bs, request->sector_num,
+        	                request->qiov, request->nb_sectors,
+				request->cb, request->opaque);
+
+    ret = (res == NULL) ? 0 : 1; 
+
+    return ret;
+}
diff --git a/block/blk-queue.h b/block/blk-queue.h
new file mode 100644
index 0000000..47f8a36
--- /dev/null
+++ b/block/blk-queue.h
@@ -0,0 +1,70 @@ 
+/*
+ * QEMU System Emulator queue declaration for block layer
+ *
+ * Copyright (c) 2011 Zhi Yong Wu  <zwu.kernel@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_BLOCK_QUEUE_H
+#define QEMU_BLOCK_QUEUE_H
+
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+
+typedef BlockDriverAIOCB* (BlockRequestHandler) (BlockDriverState *bs,
+                                int64_t sector_num, QEMUIOVector *qiov,
+                                int nb_sectors, BlockDriverCompletionFunc *cb,
+                                void *opaque);
+
+struct BlockIORequest {
+    QTAILQ_ENTRY(BlockIORequest) entry;
+    BlockDriverState *bs;
+    BlockRequestHandler *handler;
+    int64_t sector_num;
+    QEMUIOVector *qiov;
+    int nb_sectors;
+    BlockDriverCompletionFunc *cb;
+    void *opaque;
+};
+
+typedef struct BlockIORequest BlockIORequest;
+
+struct BlockQueue {
+    QTAILQ_HEAD(requests, BlockIORequest) requests;
+};
+
+typedef struct BlockQueue BlockQueue;
+
+BlockQueue *qemu_new_block_queue(void);
+
+void qemu_del_block_queue(BlockQueue *queue);
+
+BlockDriverAIOCB *qemu_block_queue_enqueue(BlockQueue *queue,
+                        BlockDriverState *bs,
+                        BlockRequestHandler *handler,
+                        int64_t sector_num,
+                        QEMUIOVector *qiov,
+                        int nb_sectors,
+                        BlockDriverCompletionFunc *cb,
+                        void *opaque);
+
+int qemu_block_queue_handler(BlockIORequest *request);
+#endif /* QEMU_BLOCK_QUEUE_H */
diff --git a/block_int.h b/block_int.h
index 1e265d2..1587171 100644
--- a/block_int.h
+++ b/block_int.h
@@ -27,10 +27,17 @@ 
 #include "block.h"
 #include "qemu-option.h"
 #include "qemu-queue.h"
+#include "block/blk-queue.h"
 
 #define BLOCK_FLAG_ENCRYPT	1
 #define BLOCK_FLAG_COMPAT6	4
 
+#define BLOCK_IO_LIMIT_READ     0
+#define BLOCK_IO_LIMIT_WRITE    1
+#define BLOCK_IO_LIMIT_TOTAL    2
+
+#define BLOCK_IO_SLICE_TIME     100000000
+
 #define BLOCK_OPT_SIZE          "size"
 #define BLOCK_OPT_ENCRYPT       "encryption"
 #define BLOCK_OPT_COMPAT6       "compat6"
@@ -46,6 +53,16 @@  typedef struct AIOPool {
     BlockDriverAIOCB *free_aiocb;
 } AIOPool;
 
+typedef struct BlockIOLimit {
+    uint64_t bps[3];
+    uint64_t iops[3];
+} BlockIOLimit;
+
+typedef struct BlockIODisp {
+    uint64_t bytes[2];
+    uint64_t ios[2];
+} BlockIODisp;
+
 struct BlockDriver {
     const char *format_name;
     int instance_size;
@@ -175,6 +192,14 @@  struct BlockDriverState {
 
     void *sync_aiocb;
 
+    /* the time for latest disk I/O */
+    int64_t slice_start[2];
+    BlockIOLimit io_limits;
+    BlockIODisp  io_disps;
+    BlockQueue   *block_queue;
+    QEMUTimer    *block_timer;
+    bool         req_from_queue;
+
     /* I/O stats (display with "info blockstats"). */
     uint64_t rd_bytes;
     uint64_t wr_bytes;
@@ -222,6 +247,9 @@  void qemu_aio_release(void *p);
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 
+void bdrv_set_io_limits(BlockDriverState *bs,
+                            BlockIOLimit *io_limits);
+
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif
diff --git a/blockdev.c b/blockdev.c
index c263663..45602f4 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -238,6 +238,9 @@  DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
     int on_read_error, on_write_error;
     const char *devaddr;
     DriveInfo *dinfo;
+    BlockIOLimit io_limits;
+    bool iol_flag = false;
+    const char *iol_opts[7] = {"bps", "bps_rd", "bps_wr", "iops", "iops_rd", "iops_wr"};
     int is_extboot = 0;
     int snapshot = 0;
     int ret;
@@ -372,6 +375,19 @@  DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
         return NULL;
     }
 
+    /* disk io limits */
+    iol_flag = qemu_opt_io_limits_enable_flag(opts, iol_opts);
+    if (iol_flag) {
+        memset(&io_limits, 0, sizeof(BlockIOLimit));
+
+        io_limits.bps[2]  = qemu_opt_get_number(opts, "bps", 0);
+        io_limits.bps[0]  = qemu_opt_get_number(opts, "bps_rd", 0);
+        io_limits.bps[1]  = qemu_opt_get_number(opts, "bps_wr", 0);
+        io_limits.iops[2] = qemu_opt_get_number(opts, "iops", 0);
+        io_limits.iops[0] = qemu_opt_get_number(opts, "iops_rd", 0);
+        io_limits.iops[1] = qemu_opt_get_number(opts, "iops_wr", 0);
+    }
+
     on_write_error = BLOCK_ERR_STOP_ENOSPC;
     if ((buf = qemu_opt_get(opts, "werror")) != NULL) {
         if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) {
@@ -483,6 +499,11 @@  DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
 
     bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
 
+    /* throttling disk io limits */
+    if (iol_flag) {
+        bdrv_set_io_limits(dinfo->bdrv, &io_limits);
+    }
+
     switch(type) {
     case IF_IDE:
     case IF_SCSI:
diff --git a/qemu-config.c b/qemu-config.c
index efa892c..9232bbb 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -82,6 +82,30 @@  static QemuOptsList qemu_drive_opts = {
             .name = "boot",
             .type = QEMU_OPT_BOOL,
             .help = "make this a boot drive",
+        },{
+            .name = "iops",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit total I/O operations per second",
+        },{
+            .name = "iops_rd",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit read operations per second",
+        },{
+            .name = "iops_wr",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit write operations per second",
+        },{
+            .name = "bps",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit total bytes per second",
+        },{
+            .name = "bps_rd",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit read bytes per second",
+        },{
+            .name = "bps_wr",
+            .type = QEMU_OPT_NUMBER,
+            .help = "limit write bytes per second",
         },
         { /* end of list */ }
     },
diff --git a/qemu-option.c b/qemu-option.c
index 65db542..9fe234d 100644
--- a/qemu-option.c
+++ b/qemu-option.c
@@ -562,6 +562,23 @@  uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
     return opt->value.uint;
 }
 
+bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts)
+{
+     int i;
+     uint64_t opt_val   = 0;
+     bool iol_flag = false;
+
+     for (i = 0; iol_opts[i]; i++) {
+	 opt_val = qemu_opt_get_number(opts, iol_opts[i], 0);
+	 if (opt_val != 0) {
+	     iol_flag = true;
+	     break;
+	 }
+     }
+
+     return iol_flag;
+}
+
 uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval)
 {
     QemuOpt *opt = qemu_opt_find(opts, name);
diff --git a/qemu-option.h b/qemu-option.h
index b515813..fc909f9 100644
--- a/qemu-option.h
+++ b/qemu-option.h
@@ -107,6 +107,7 @@  struct QemuOptsList {
 const char *qemu_opt_get(QemuOpts *opts, const char *name);
 int qemu_opt_get_bool(QemuOpts *opts, const char *name, int defval);
 uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval);
+bool qemu_opt_io_limits_enable_flag(QemuOpts *opts, const char **iol_opts);
 uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval);
 int qemu_opt_set(QemuOpts *opts, const char *name, const char *value);
 typedef int (*qemu_opt_loopfunc)(const char *name, const char *value, void *opaque);
diff --git a/qemu-options.hx b/qemu-options.hx
index cb3347e..ae219f5 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -121,6 +121,7 @@  DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [,cache=writethrough|writeback|none|unsafe][,format=f]\n"
     "       [,serial=s][,addr=A][,id=name][,aio=threads|native]\n"
     "       [,readonly=on|off][,boot=on|off]\n"
+    "       [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n"
     "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
 STEXI
 @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]