Patchwork [v7,3/4] block: add block timer and throttling algorithm

login
register
mail settings
Submitter Zhi Yong Wu
Date Sept. 6, 2011, 10:48 a.m.
Message ID <1315306101-3129-4-git-send-email-wuzhy@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/113538/
State New
Headers show

Comments

Zhi Yong Wu - Sept. 6, 2011, 10:48 a.m.
Note:
     1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
     2.) When "dd" command is issued in guest, if its option bs is set to a large value such as "bs=1024K", the result speed will slightly bigger than the limits.

For these problems, if you have nice thought, pls let us know.:)

Signed-off-by: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
---
 block.c    |  297 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 block.h    |    6 +-
 blockdev.c |   69 ++++++++++++++
 3 files changed, 361 insertions(+), 11 deletions(-)

Patch

diff --git a/block.c b/block.c
index 17ee3df..1f6f188 100644
--- a/block.c
+++ b/block.c
@@ -30,6 +30,9 @@ 
 #include "qemu-objects.h"
 #include "qemu-coroutine.h"
 
+#include "qemu-timer.h"
+#include "block/blk-queue.h"
+
 #ifdef CONFIG_BSD
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -72,6 +75,13 @@  static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          QEMUIOVector *iov);
 static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+        double elapsed_time, uint64_t *wait);
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+        bool is_write, int64_t *wait);
+
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
@@ -104,6 +114,57 @@  int is_windows_drive(const char *filename)
 }
 #endif
 
+/* throttling disk I/O limits */
+void bdrv_io_limits_disable(BlockDriverState *bs)
+{
+    bs->io_limits_enabled = false;
+
+    if (bs->block_queue) {
+        qemu_block_queue_flush(bs->block_queue);
+        qemu_del_block_queue(bs->block_queue);
+        bs->block_queue = NULL;
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer     = NULL;
+    }
+
+    bs->slice_start = 0;
+
+    bs->slice_end   = 0;
+}
+
+static void bdrv_block_timer(void *opaque)
+{
+    BlockDriverState *bs = opaque;
+    BlockQueue *queue    = bs->block_queue;
+
+    qemu_block_queue_flush(queue);
+}
+
+void bdrv_io_limits_enable(BlockDriverState *bs)
+{
+    bs->block_queue = qemu_new_block_queue();
+    bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);
+
+    bs->slice_start = qemu_get_clock_ns(vm_clock);
+
+    bs->slice_end   = bs->slice_start + BLOCK_IO_SLICE_TIME;
+}
+
+bool bdrv_io_limits_enabled(BlockDriverState *bs)
+{
+    BlockIOLimit *io_limits = &bs->io_limits;
+    return io_limits->bps[BLOCK_IO_LIMIT_READ]
+         || io_limits->bps[BLOCK_IO_LIMIT_WRITE]
+         || io_limits->bps[BLOCK_IO_LIMIT_TOTAL]
+         || io_limits->iops[BLOCK_IO_LIMIT_READ]
+         || io_limits->iops[BLOCK_IO_LIMIT_WRITE]
+         || io_limits->iops[BLOCK_IO_LIMIT_TOTAL];
+}
+
 /* check if the path starts with "<protocol>:" */
 static int path_has_protocol(const char *path)
 {
@@ -694,6 +755,11 @@  int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
 
+    /* throttling disk I/O limits */
+    if (bs->io_limits_enabled) {
+        bdrv_io_limits_enable(bs);
+    }
+
     return 0;
 
 unlink_and_fail:
@@ -732,6 +798,18 @@  void bdrv_close(BlockDriverState *bs)
         if (bs->change_cb)
             bs->change_cb(bs->change_opaque, CHANGE_MEDIA);
     }
+
+    /* throttling disk I/O limits */
+    if (bs->block_queue) {
+        qemu_del_block_queue(bs->block_queue);
+        bs->block_queue = NULL;
+    }
+
+    if (bs->block_timer) {
+        qemu_del_timer(bs->block_timer);
+        qemu_free_timer(bs->block_timer);
+        bs->block_timer = NULL;
+    }
 }
 
 void bdrv_close_all(void)
@@ -2290,16 +2368,40 @@  BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                                  BlockDriverCompletionFunc *cb, void *opaque)
 {
     BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+    int64_t wait_time = -1;
 
     trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bdrv_check_request(bs, sector_num, nb_sectors)) {
         return NULL;
+    }
 
-    return drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
+    /* throttling disk read I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, false, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_readv,
+                           sector_num, qiov, nb_sectors, cb, opaque);
+            if (wait_time != -1) {
+                qemu_mod_timer(bs->block_timer,
+                               wait_time + qemu_get_clock_ns(vm_clock));
+            }
+
+            return ret;
+        }
+    }
+
+    ret =  drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
+    if (ret) {
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_READ] +=
+                              (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_READ]++;
+        }
+    }
+
+    return ret;
 }
 
 typedef struct BlockCompleteData {
@@ -2345,15 +2447,14 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
     BlockDriver *drv = bs->drv;
     BlockDriverAIOCB *ret;
     BlockCompleteData *blk_cb_data;
+    int64_t wait_time = -1;
 
     trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque);
 
-    if (!drv)
-        return NULL;
-    if (bs->read_only)
-        return NULL;
-    if (bdrv_check_request(bs, sector_num, nb_sectors))
+    if (!drv || bs->read_only
+        || bdrv_check_request(bs, sector_num, nb_sectors)) {
         return NULL;
+    }
 
     if (bs->dirty_bitmap) {
         blk_cb_data = blk_dirty_cb_alloc(bs, sector_num, nb_sectors, cb,
@@ -2362,13 +2463,32 @@  BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num,
         opaque = blk_cb_data;
     }
 
+    /* throttling disk write I/O */
+    if (bs->io_limits_enabled) {
+        if (bdrv_exceed_io_limits(bs, nb_sectors, true, &wait_time)) {
+            ret = qemu_block_queue_enqueue(bs->block_queue, bs, bdrv_aio_writev,
+                                  sector_num, qiov, nb_sectors, cb, opaque);
+            if (wait_time != -1) {
+                qemu_mod_timer(bs->block_timer,
+                               wait_time + qemu_get_clock_ns(vm_clock));
+            }
+
+            return ret;
+        }
+    }
+
     ret = drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
                                cb, opaque);
-
     if (ret) {
         if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
             bs->wr_highest_sector = sector_num + nb_sectors - 1;
         }
+
+        if (bs->io_limits_enabled) {
+            bs->io_disps.bytes[BLOCK_IO_LIMIT_WRITE] +=
+                               (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+            bs->io_disps.ios[BLOCK_IO_LIMIT_WRITE]++;
+        }
     }
 
     return ret;
@@ -2633,6 +2753,163 @@  void bdrv_aio_cancel(BlockDriverAIOCB *acb)
     acb->pool->cancel(acb);
 }
 
+static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
+                 bool is_write, double elapsed_time, uint64_t *wait) {
+    uint64_t bps_limit = 0;
+    double   bytes_limit, bytes_disp, bytes_res;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.bps[is_write]) {
+        bps_limit = bs->io_limits.bps[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    bytes_limit = bps_limit * slice_time;
+    bytes_disp  = bs->io_disps.bytes[is_write];
+    if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
+        bytes_disp += bs->io_disps.bytes[!is_write];
+    }
+
+    bytes_res   = (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+
+    if (bytes_disp + bytes_res <= bytes_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
+                             double elapsed_time, uint64_t *wait) {
+    uint64_t iops_limit = 0;
+    double   ios_limit, ios_disp;
+    double   slice_time, wait_time;
+
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL];
+    } else if (bs->io_limits.iops[is_write]) {
+        iops_limit = bs->io_limits.iops[is_write];
+    } else {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    slice_time = bs->slice_end - bs->slice_start;
+    slice_time /= (NANOSECONDS_PER_SECOND);
+    ios_limit  = iops_limit * slice_time;
+    ios_disp   = bs->io_disps.ios[is_write];
+    if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+        ios_disp += bs->io_disps.ios[!is_write];
+    }
+
+    if (ios_disp + 1 <= ios_limit) {
+        if (wait) {
+            *wait = 0;
+        }
+
+        return false;
+    }
+
+    /* Calc approx time to dispatch */
+    wait_time = (ios_disp + 1) / iops_limit;
+    if (wait_time > elapsed_time) {
+        wait_time = wait_time - elapsed_time;
+    } else {
+        wait_time = 0;
+    }
+
+    if (wait) {
+        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+    }
+
+    return true;
+}
+
+static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
+                           bool is_write, int64_t *wait) {
+    int64_t  now, max_wait;
+    uint64_t bps_wait = 0, iops_wait = 0;
+    double   elapsed_time;
+    int      bps_ret, iops_ret;
+
+    now = qemu_get_clock_ns(vm_clock);
+    if ((bs->slice_start < now)
+        && (bs->slice_end > now)) {
+        bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+    } else {
+        bs->slice_start = now;
+        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
+
+        bs->io_disps.bytes[is_write]  = 0;
+        bs->io_disps.bytes[!is_write] = 0;
+
+        bs->io_disps.ios[is_write]    = 0;
+        bs->io_disps.ios[!is_write]   = 0;
+    }
+
+    /* If a limit was exceeded, immediately queue this request */
+    if (qemu_block_queue_has_pending(bs->block_queue)) {
+        if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]
+            || bs->io_limits.bps[is_write] || bs->io_limits.iops[is_write]
+            || bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
+            if (wait) {
+                *wait = -1;
+            }
+
+            return true;
+        }
+    }
+
+    elapsed_time  = now - bs->slice_start;
+    elapsed_time  /= (NANOSECONDS_PER_SECOND);
+
+    bps_ret  = bdrv_exceed_bps_limits(bs, nb_sectors,
+                                      is_write, elapsed_time, &bps_wait);
+    iops_ret = bdrv_exceed_iops_limits(bs, is_write,
+                                      elapsed_time, &iops_wait);
+    if (bps_ret || iops_ret) {
+        max_wait = bps_wait > iops_wait ? bps_wait : iops_wait;
+        if (wait) {
+            *wait = max_wait;
+        }
+
+        now = qemu_get_clock_ns(vm_clock);
+        if (bs->slice_end < now + max_wait) {
+            bs->slice_end = now + max_wait;
+        }
+
+        return true;
+    }
+
+    if (wait) {
+        *wait = 0;
+    }
+
+    return false;
+}
 
 /**************************************************************/
 /* async block device emulation */
diff --git a/block.h b/block.h
index 3ac0b94..10d2828 100644
--- a/block.h
+++ b/block.h
@@ -58,6 +58,11 @@  void bdrv_info(Monitor *mon, QObject **ret_data);
 void bdrv_stats_print(Monitor *mon, const QObject *data);
 void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
+/* disk I/O throttling */
+void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_disable(BlockDriverState *bs);
+bool bdrv_io_limits_enabled(BlockDriverState *bs);
+
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
 BlockDriver *bdrv_find_protocol(const char *filename);
@@ -102,7 +107,6 @@  int bdrv_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt);
 void bdrv_register(BlockDriver *bdrv);
 
-
 typedef struct BdrvCheckResult {
     int corruptions;
     int leaks;
diff --git a/blockdev.c b/blockdev.c
index 619ae9f..7f5c4df 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -747,6 +747,75 @@  int do_change_block(Monitor *mon, const char *device,
     return monitor_read_bdrv_key_start(mon, bs, NULL, NULL);
 }
 
+/* throttling disk I/O limits */
+int do_block_set_io_throttle(Monitor *mon,
+                       const QDict *qdict, QObject **ret_data)
+{
+    const char *devname = qdict_get_str(qdict, "device");
+    uint64_t bps        = qdict_get_try_int(qdict, "bps", -1);
+    uint64_t bps_rd     = qdict_get_try_int(qdict, "bps_rd", -1);
+    uint64_t bps_wr     = qdict_get_try_int(qdict, "bps_wr", -1);
+    uint64_t iops       = qdict_get_try_int(qdict, "iops", -1);
+    uint64_t iops_rd    = qdict_get_try_int(qdict, "iops_rd", -1);
+    uint64_t iops_wr    = qdict_get_try_int(qdict, "iops_wr", -1);
+    BlockDriverState *bs;
+
+    bs = bdrv_find(devname);
+    if (!bs) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, devname);
+        return -1;
+    }
+
+    if ((bps == -1) && (bps_rd == -1) && (bps_wr == -1)
+        && (iops == -1) && (iops_rd == -1) && (iops_wr == -1)) {
+        qerror_report(QERR_MISSING_PARAMETER,
+                      "bps/bps_rd/bps_wr/iops/iops_rd/iops_wr");
+        return -1;
+    }
+
+    if (((bps != -1) && ((bps_rd != -1) || (bps_wr != -1)))
+        || ((iops != -1) && ((iops_rd != -1) || (iops_wr != -1)))) {
+        qerror_report(QERR_INVALID_PARAMETER_COMBINATION);
+        return -1;
+    }
+
+    if (bps != -1) {
+        bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = bps;
+        bs->io_limits.bps[BLOCK_IO_LIMIT_READ]  = 0;
+        bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE] = 0;
+    }
+
+    if ((bps_rd != -1) || (bps_wr != -1)) {
+        bs->io_limits.bps[BLOCK_IO_LIMIT_READ]   =
+           (bps_rd == -1) ? bs->io_limits.bps[BLOCK_IO_LIMIT_READ] : bps_rd;
+        bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]  =
+           (bps_wr == -1) ? bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE] : bps_wr;
+        bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]  = 0;
+    }
+
+    if (iops != -1) {
+        bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = iops;
+        bs->io_limits.iops[BLOCK_IO_LIMIT_READ]  = 0;
+        bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] = 0;
+    }
+
+    if ((iops_rd != -1) || (iops_wr != -1)) {
+        bs->io_limits.iops[BLOCK_IO_LIMIT_READ]  =
+           (iops_rd == -1) ? bs->io_limits.iops[BLOCK_IO_LIMIT_READ] : iops_rd;
+        bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] =
+           (iops_wr == -1) ? bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] : iops_wr;
+        bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = 0;
+    }
+
+    if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) {
+        bdrv_io_limits_enable(bs);
+    } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {
+        bdrv_io_limits_disable(bs);
+    }
+
+    return 0;
+}
+
 int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data)
 {
     const char *id = qdict_get_str(qdict, "id");