Patchwork [v8,3/4] block: add block timer and throttling algorithm

login
register
mail settings
Submitter Zhiyong Wu
Date Sept. 21, 2011, 7:03 a.m.
Message ID <CAEH94Ljs6nO+KCTcM=F=yXOTW6OpLQdxNUsFc38He1QhH9uEcA@mail.gmail.com>
Download mbox | patch
Permalink /patch/115691/
State New
Headers show

Comments

Zhiyong Wu - Sept. 21, 2011, 7:03 a.m.
On Tue, Sep 20, 2011 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Mon, Sep 19, 2011 at 05:55:41PM +0800, Zhi Yong Wu wrote:
>> On Wed, Sep 14, 2011 at 6:50 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> > On Tue, Sep 13, 2011 at 11:09:46AM +0800, Zhi Yong Wu wrote:
>> >> On Fri, Sep 9, 2011 at 10:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> >> > On Thu, Sep 08, 2011 at 06:11:07PM +0800, Zhi Yong Wu wrote:
>> >> >> Note:
>> >> >>      1.) When bps/iops limits are specified to a small value such as 511 bytes/s, this VM will hang up. We are considering how to handle this senario.
>> >> >
>> >> > You can increase the length of the slice, if the request is larger than
>> >> > slice_time * bps_limit.
>> >> Yeah, but it is a challenge for how to increase it. Do you have some nice idea?
>> >
>> > If the queue is empty, and the request being processed does not fit the
>> > queue, increase the slice so that the request fits.
>> Sorry for late reply. actually, do you think that this scenario is
>> meaningful for the user?
>> Since we implement this, if the user limits the bps below 512
>> bytes/second, the VM can also not run every task.
>> Can you let us know why we need to make such effort?
>
> It would be good to handle request larger than the slice.
Below is the code changes for your way. I used simple trace and did dd
test on guest, then found only the first rw req is handled, and
subsequent reqs are enqueued. After several minutes, guest prints the
info below on its terminal:
INFO: task kdmflush:326 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.

I don't make sure if it is correct. Do you have better way to verify it?

>
> It is not strictly necessary, but in case its not handled, a minimum
> should be in place, to reflect maximum request size known. Being able to
> specify something which crashes is not acceptable.
>
>

Patch

diff --git a/block.c b/block.c
index af19784..f88c22a 100644
--- a/block.c
+++ b/block.c
@@ -132,9 +132,10 @@  void bdrv_io_limits_disable(BlockDriverState *bs)
         bs->block_timer     = NULL;
     }

-    bs->slice_start = 0;
-
-    bs->slice_end   = 0;
+    bs->slice_time    = 0;
+    bs->slice_start   = 0;
+    bs->slice_end     = 0;
+    bs->first_time_rw = false;
 }

 static void bdrv_block_timer(void *opaque)
@@ -151,9 +152,10 @@  void bdrv_io_limits_enable(BlockDriverState *bs)
     bs->block_queue = qemu_new_block_queue();
     bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs);

+    bs->slice_time  = BLOCK_IO_SLICE_TIME;
     bs->slice_start = qemu_get_clock_ns(vm_clock);
-
-    bs->slice_end   = bs->slice_start + BLOCK_IO_SLICE_TIME;
+    bs->slice_end   = bs->slice_start + bs->slice_time;
+    bs->first_time_rw = true;
 }

 bool bdrv_io_limits_enabled(BlockDriverState *bs)
@@ -2846,11 +2848,23 @@  static bool
bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
     /* Calc approx time to dispatch */
     wait_time = (bytes_disp + bytes_res) / bps_limit - elapsed_time;

-    if (wait) {
-        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
-    }
+    if (!bs->first_time_rw
+        || !qemu_block_queue_is_empty(bs->block_queue)) {
+        if (wait) {
+            *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+        }

-    return true;
+        return true;
+    } else {
+        bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+        bs->slice_end += bs->slice_time - BLOCK_IO_SLICE_TIME;
+        if (wait) {
+            *wait = 0;
+        }
+
+        bs->first_time_rw = false;
+        return false;
+    }
 }

 static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
@@ -2895,11 +2909,23 @@  static bool
bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
         wait_time = 0;
     }

-    if (wait) {
-        *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
-    }
+    if (!bs->first_time_rw
+        || !qemu_block_queue_is_empty(bs->block_queue)) {
+        if (wait) {
+            *wait = wait_time * BLOCK_IO_SLICE_TIME * 10;
+        }

-    return true;
+        return true;
+    } else {
+        bs->slice_time = wait_time * BLOCK_IO_SLICE_TIME * 10;
+        bs->slice_end += bs->slice_time - BLOCK_IO_SLICE_TIME;
+        if (wait) {
+            *wait = 0;
+        }
+
+        bs->first_time_rw = false;
+        return false;
+    }
 }

 static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
@@ -2912,10 +2938,10 @@  static bool
bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
     now = qemu_get_clock_ns(vm_clock);
     if ((bs->slice_start < now)
         && (bs->slice_end > now)) {
-        bs->slice_end = now + BLOCK_IO_SLICE_TIME;
+        bs->slice_end = now + bs->slice_time;
     } else {
         bs->slice_start = now;
-        bs->slice_end   = now + BLOCK_IO_SLICE_TIME;
+        bs->slice_end   = now + bs->slice_time;

         bs->io_disps.bytes[is_write]  = 0;
         bs->io_disps.bytes[!is_write] = 0;
diff --git a/block/blk-queue.c b/block/blk-queue.c
index adef497..04e52ad 100644
--- a/block/blk-queue.c
+++ b/block/blk-queue.c
@@ -199,3 +199,8 @@  bool qemu_block_queue_has_pending(BlockQueue *queue)
 {
     return !queue->flushing && !QTAILQ_EMPTY(&queue->requests);
 }
+
+bool qemu_block_queue_is_empty(BlockQueue *queue)
+{
+    return QTAILQ_EMPTY(&queue->requests);
+}
diff --git a/block/blk-queue.h b/block/blk-queue.h
index c1529f7..d3b379b 100644
--- a/block/blk-queue.h
+++ b/block/blk-queue.h
@@ -56,4 +56,6 @@  void qemu_block_queue_flush(BlockQueue *queue);

 bool qemu_block_queue_has_pending(BlockQueue *queue);

+bool qemu_block_queue_is_empty(BlockQueue *queue);
+
 #endif /* QEMU_BLOCK_QUEUE_H */
diff --git a/block_int.h b/block_int.h
index 93c0d56..5eb007d 100644
--- a/block_int.h
+++ b/block_int.h
@@ -199,6 +199,7 @@  struct BlockDriverState {
     void *sync_aiocb;

     /* the time for latest disk I/O */
+    int64_t slice_time;
     int64_t slice_start;
     int64_t slice_end;
     BlockIOLimit io_limits;
@@ -206,6 +207,7 @@  struct BlockDriverState {
     BlockQueue   *block_queue;
     QEMUTimer    *block_timer;
     bool         io_limits_enabled;
+    bool         first_time_rw;

     /* I/O stats (display with "info blockstats"). */
     uint64_t nr_bytes[BDRV_MAX_IOTYPE];
diff --git a/blockdev.c b/blockdev.c
index 63bd2b5..67d5a50 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -782,6 +782,8 @@  int do_block_set_io_throttle(Monitor *mon,
     bs->io_limits.iops[BLOCK_IO_LIMIT_READ]  = iops_rd;
     bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE] = iops_wr;

+    bs->slice_time = BLOCK_IO_SLICE_TIME;
+
     if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) {
         bdrv_io_limits_enable(bs);
     } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) {