diff mbox

[4/7] throttle: Add throttle group support

Message ID 3060954d506be499c26d07ba4624dd33cd7b002d.1427732020.git.berto@igalia.com
State New
Headers show

Commit Message

Alberto Garcia March 30, 2015, 4:19 p.m. UTC
The throttle group support use a cooperative round robin scheduling
algorithm.

The principles of the algorithm are simple:
- Each BDS of the group is used as a token in a circular way.
- The active BDS computes if a wait must be done and arms the right
  timer.
- If a wait must be done the token timer will be armed so the token
  will become the next active BDS.

Signed-off-by: Alberto Garcia <berto@igalia.com>
---
 block.c                         |  83 +++++-----------
 block/qapi.c                    |   5 +-
 block/throttle-groups.c         | 207 +++++++++++++++++++++++++++++++++++++++-
 blockdev.c                      |  22 ++++-
 hmp.c                           |   4 +-
 include/block/block.h           |   3 +-
 include/block/block_int.h       |   7 +-
 include/block/throttle-groups.h |   4 +
 qapi/block-core.json            |   4 +-
 qemu-options.hx                 |   1 +
 qmp-commands.hx                 |   3 +-
 11 files changed, 271 insertions(+), 72 deletions(-)

Comments

Fam Zheng April 1, 2015, 2:44 p.m. UTC | #1
On Mon, 03/30 19:19, Alberto Garcia wrote:
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index 7873084..d8211b7 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -990,6 +990,8 @@
>  #
>  # @iops_size: #optional an I/O size in bytes (Since 1.7)
>  #
> +# @group: #optional throttle group name (Since 2.3)

We should probably elaborate (here, and at other places of @group appearances):
@device is used as group name. This is useful since other devices could use
this device name as the group name, for exmple the following -drive definition:

...
-drive file=foo,id=foo,bps=100 \
-drive file=bar,id=foo,bps=200,group=foo \
...

will put both devices into the same group.

Also, as the two bps values don't match, I assume the last one is used? Is
there a warning when ignoring a config from previous definitions?

Thinking about this, I'd slightly prefer a canonical throttle group definition
rather than patching the existing parameters:

-object throttle-group,id=tg0,bps=100,iops=200,iops-max=1000 \
-drive file=foo,id=foo,throttle-group=tg0 \
-drive file=bar,id=bar,throttle-group=tg0 \

and error out if "bps=" etc are specified together with "throttle-group=" in
-drive.

And in QMP, add block_set_io_throttle_group which works together with
object-add.

What do you think?

> +#
>  # Returns: Nothing on success
>  #          If @device is not a valid block device, DeviceNotFound
>  #
> @@ -1001,7 +1003,7 @@
>              '*bps_max': 'int', '*bps_rd_max': 'int',
>              '*bps_wr_max': 'int', '*iops_max': 'int',
>              '*iops_rd_max': 'int', '*iops_wr_max': 'int',
> -            '*iops_size': 'int' } }
> +            '*iops_size': 'int', '*group': 'str' } }
>  
>  ##
>  # @block-stream:
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 319d971..7d6b2ec 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -464,6 +464,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
>      "       [[,bps_max=bm]|[[,bps_rd_max=rm][,bps_wr_max=wm]]]\n"
>      "       [[,iops_max=im]|[[,iops_rd_max=irm][,iops_wr_max=iwm]]]\n"
>      "       [[,iops_size=is]]\n"
> +    "       [[,group=g]]\n"
>      "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
>  STEXI
>  @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
> diff --git a/qmp-commands.hx b/qmp-commands.hx
> index 3a42ad0..ce897ff 100644
> --- a/qmp-commands.hx
> +++ b/qmp-commands.hx
> @@ -1730,7 +1730,7 @@ EQMP
>  
>      {
>          .name       = "block_set_io_throttle",
> -        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l,bps_max:l?,bps_rd_max:l?,bps_wr_max:l?,iops_max:l?,iops_rd_max:l?,iops_wr_max:l?,iops_size:l?",
> +        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l,bps_max:l?,bps_rd_max:l?,bps_wr_max:l?,iops_max:l?,iops_rd_max:l?,iops_wr_max:l?,iops_size:l?,group:s?",
>          .mhandler.cmd_new = qmp_marshal_input_block_set_io_throttle,
>      },
>  
> @@ -1756,6 +1756,7 @@ Arguments:
>  - "iops_rd_max":  read I/O operations max (json-int)
>  - "iops_wr_max":  write I/O operations max (json-int)
>  - "iops_size":  I/O size in bytes when limiting (json-int)

Why removing these three?

> +- "group": throttle group name (json-string)
>  
>  Example:
>  
> -- 
> 2.1.4
> 
>
Alberto Garcia April 1, 2015, 3:18 p.m. UTC | #2
On Wed, Apr 01, 2015 at 10:44:51PM +0800, Fam Zheng wrote:

> > +# @group: #optional throttle group name (Since 2.3)
> 
> We should probably elaborate (here, and at other places of @group
> appearances): @device is used as group name. This is useful since
> other devices could use this device name as the group name, for
> exmple the following -drive definition:
> 
> ...
> -drive file=foo,id=foo,bps=100 \
> -drive file=bar,id=foo,bps=200,group=foo \
> ...
> 
> will put both devices into the same group.

That's right, thanks for pointing it out. I can update the
documentation. Alternatively we could use some other mechanism for
auto-generating group names, rather than using the name of the device,
but either way I think it deserves being documented.

> Also, as the two bps values don't match, I assume the last one
> is used? Is there a warning when ignoring a config from previous
> definitions?

Everytime a new device is added to a group it sets new values for the
whole group, because values are not attached to any particular device.
For the same reason if one device is removed the rest will continue
working with the existing configuration.

> Thinking about this, I'd slightly prefer a canonical throttle group
> definition rather than patching the existing parameters:
> 
> -object throttle-group,id=tg0,bps=100,iops=200,iops-max=1000 \
> -drive file=foo,id=foo,throttle-group=tg0 \
> -drive file=bar,id=bar,throttle-group=tg0 \
> 
> and error out if "bps=" etc are specified together with
> "throttle-group=" in -drive.
> 
> And in QMP, add block_set_io_throttle_group which works together
> with object-add.

And then you would create a group automatically for drives that have
bps= specified? What would happen if all members of the group are
removed? And what would happen if it's a group created with -object?

My first impression is that the idea is feasible, but I'm unsure at
the moment of its complexity or possible side effects.

> > @@ -1756,6 +1756,7 @@ Arguments:
> >  - "iops_rd_max":  read I/O operations max (json-int)
> >  - "iops_wr_max":  write I/O operations max (json-int)
> >  - "iops_size":  I/O size in bytes when limiting (json-int)
> 
> Why removing these three?

There's nothing removed, that's just a list of bullets, note the
column where the dashes appear :)

Berto
Fam Zheng April 2, 2015, 3:26 a.m. UTC | #3
On Wed, 04/01 17:18, Alberto Garcia wrote:
> On Wed, Apr 01, 2015 at 10:44:51PM +0800, Fam Zheng wrote:
> 
> > > +# @group: #optional throttle group name (Since 2.3)
> > 
> > We should probably elaborate (here, and at other places of @group
> > appearances): @device is used as group name. This is useful since
> > other devices could use this device name as the group name, for
> > exmple the following -drive definition:
> > 
> > ...
> > -drive file=foo,id=foo,bps=100 \
> > -drive file=bar,id=foo,bps=200,group=foo \
> > ...
> > 
> > will put both devices into the same group.
> 
> That's right, thanks for pointing it out. I can update the
> documentation. Alternatively we could use some other mechanism for
> auto-generating group names, rather than using the name of the device,
> but either way I think it deserves being documented.
> 
> > Also, as the two bps values don't match, I assume the last one
> > is used? Is there a warning when ignoring a config from previous
> > definitions?
> 
> Everytime a new device is added to a group it sets new values for the
> whole group, because values are not attached to any particular device.
> For the same reason if one device is removed the rest will continue
> working with the existing configuration.
> 
> > Thinking about this, I'd slightly prefer a canonical throttle group
> > definition rather than patching the existing parameters:
> > 
> > -object throttle-group,id=tg0,bps=100,iops=200,iops-max=1000 \
> > -drive file=foo,id=foo,throttle-group=tg0 \
> > -drive file=bar,id=bar,throttle-group=tg0 \
> > 
> > and error out if "bps=" etc are specified together with
> > "throttle-group=" in -drive.
> > 
> > And in QMP, add block_set_io_throttle_group which works together
> > with object-add.
> 
> And then you would create a group automatically for drives that have
> bps= specified?

Yes.

> What would happen if all members of the group are removed?

The automatic group created from "bps=" should be anonymous, which is attached
by the only one BDS. If it is removed, refcount should release the group
automatically.

> And what would happen if it's a group created with -object?

The -object has a refcount, so it should stay even when all members are
removed, unless "(QMP) object-del".
> 
> My first impression is that the idea is feasible, but I'm unsure at
> the moment of its complexity or possible side effects.

I think it is a cleaner model and easier to understand. What complexity are you
referring to?

> 
> > > @@ -1756,6 +1756,7 @@ Arguments:
> > >  - "iops_rd_max":  read I/O operations max (json-int)
> > >  - "iops_wr_max":  write I/O operations max (json-int)
> > >  - "iops_size":  I/O size in bytes when limiting (json-int)
> > 
> > Why removing these three?
> 
> There's nothing removed, that's just a list of bullets, note the
> column where the dashes appear :)

:D

Fam
Alberto Garcia April 2, 2015, 7:36 a.m. UTC | #4
On Thu, Apr 02, 2015 at 11:26:30AM +0800, Fam Zheng wrote:

> > > Thinking about this, I'd slightly prefer a canonical throttle
> > > group definition rather than patching the existing parameters:
> > > 
> > > -object throttle-group,id=tg0,bps=100,iops=200,iops-max=1000 \
> > > -drive file=foo,id=foo,throttle-group=tg0 \
> > > -drive file=bar,id=bar,throttle-group=tg0 \
> > > 
> > > and error out if "bps=" etc are specified together with
> > > "throttle-group=" in -drive.
> > > 
> > > And in QMP, add block_set_io_throttle_group which works together
> > > with object-add.
>
> > My first impression is that the idea is feasible, but I'm unsure
> > at the moment of its complexity or possible side effects.
>
> I think it is a cleaner model and easier to understand. What
> complexity are you referring to?

The changes on the code. At first sight it seems that it should not
be too difficult to convert the current implementation (and API) into
the one you are proposing, but I'll wait for a few days to hear other
opinions before attempting to rewrite it :)

Berto
Stefan Hajnoczi April 9, 2015, 2:22 p.m. UTC | #5
On Mon, Mar 30, 2015 at 07:19:42PM +0300, Alberto Garcia wrote:
> @@ -1941,9 +1951,11 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
>      aio_context_acquire(aio_context);
>  
>      if (!bs->io_limits_enabled && throttle_enabled(&cfg)) {
> -        bdrv_io_limits_enable(bs);
> +        bdrv_io_limits_enable(bs, has_group ? group : device);
>      } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) {
>          bdrv_io_limits_disable(bs);
> +    } else if (bs->io_limits_enabled && throttle_enabled(&cfg)) {
> +        bdrv_io_limits_update_group(bs, has_group ? group : device);
>      }
>  
>      if (bs->io_limits_enabled) {

The semantics are inconsistent:

1. Create drive0 with throttle group "mygroup".
2. Issue block-set-io-throttle device="drive0"

The result is that a new throttle group called "drive0" is created.  I
expected to modify the throttling configuration for drive0 (i.e.
"mygroup").

Now let's disable the throttle group:

3. Issue block-set-io-throttle with 0 values for device="drive0"

The result is that "mygroup" is changed to all 0s.

Enable should behave like disable and operate on the device's throttle
group.

Stefan
Alberto Garcia April 10, 2015, 7:58 a.m. UTC | #6
On Thu, Apr 09, 2015 at 03:22:57PM +0100, Stefan Hajnoczi wrote:

> > @@ -1941,9 +1951,11 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
> >      aio_context_acquire(aio_context);
> >  
> >      if (!bs->io_limits_enabled && throttle_enabled(&cfg)) {
> > -        bdrv_io_limits_enable(bs);
> > +        bdrv_io_limits_enable(bs, has_group ? group : device);
> >      } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) {
> >          bdrv_io_limits_disable(bs);
> > +    } else if (bs->io_limits_enabled && throttle_enabled(&cfg)) {
> > +        bdrv_io_limits_update_group(bs, has_group ? group : device);
> >      }
> >  
> >      if (bs->io_limits_enabled) {
> 
> The semantics are inconsistent:
> 
> 1. Create drive0 with throttle group "mygroup".
> 2. Issue block-set-io-throttle device="drive0"
> 
> The result is that a new throttle group called "drive0" is created.
> I expected to modify the throttling configuration for drive0 (i.e.
> "mygroup").

That's right. What we can do is choose the group to update using the
following criteria, in order of precedence:

1) The 'group' parameter from block-set-io-throttle.
2) The group the device is already member of.
3) A new group ("drive0" in this example).

Currently we're not doing 2).

> Now let's disable the throttle group:
> 
> 3. Issue block-set-io-throttle with 0 values for device="drive0"
> 
> The result is that "mygroup" is changed to all 0s.

No, that simply removes the device from the group without touching its
configuration:

> >      } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) {
> >          bdrv_io_limits_disable(bs);

The group name passed by the user is actually not used in this case.

Berto
Stefan Hajnoczi April 10, 2015, 9:52 a.m. UTC | #7
On Fri, Apr 10, 2015 at 09:58:34AM +0200, Alberto Garcia wrote:
> On Thu, Apr 09, 2015 at 03:22:57PM +0100, Stefan Hajnoczi wrote:
> 
> > > @@ -1941,9 +1951,11 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
> > >      aio_context_acquire(aio_context);
> > >  
> > >      if (!bs->io_limits_enabled && throttle_enabled(&cfg)) {
> > > -        bdrv_io_limits_enable(bs);
> > > +        bdrv_io_limits_enable(bs, has_group ? group : device);
> > >      } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) {
> > >          bdrv_io_limits_disable(bs);
> > > +    } else if (bs->io_limits_enabled && throttle_enabled(&cfg)) {
> > > +        bdrv_io_limits_update_group(bs, has_group ? group : device);
> > >      }
> > >  
> > >      if (bs->io_limits_enabled) {
> > 
> > The semantics are inconsistent:
> > 
> > 1. Create drive0 with throttle group "mygroup".
> > 2. Issue block-set-io-throttle device="drive0"
> > 
> > The result is that a new throttle group called "drive0" is created.
> > I expected to modify the throttling configuration for drive0 (i.e.
> > "mygroup").
> 
> That's right. What we can do is choose the group to update using the
> following criteria, in order of precedence:
> 
> 1) The 'group' parameter from block-set-io-throttle.
> 2) The group the device is already member of.
> 3) A new group ("drive0" in this example).
> 
> Currently we're not doing 2).

Great, it makes sense to add 2).

> > Now let's disable the throttle group:
> > 
> > 3. Issue block-set-io-throttle with 0 values for device="drive0"
> > 
> > The result is that "mygroup" is changed to all 0s.
> 
> No, that simply removes the device from the group without touching its
> configuration:
> 
> > >      } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) {
> > >          bdrv_io_limits_disable(bs);
> 
> The group name passed by the user is actually not used in this case.

You are right.

Stefan
Alberto Garcia April 10, 2015, 9:55 a.m. UTC | #8
On Fri, Apr 10, 2015 at 10:52:18AM +0100, Stefan Hajnoczi wrote:

> > That's right. What we can do is choose the group to update using
> > the following criteria, in order of precedence:
> > 
> > 1) The 'group' parameter from block-set-io-throttle.
> > 2) The group the device is already member of.
> > 3) A new group ("drive0" in this example).
> > 
> > Currently we're not doing 2).
> 
> Great, it makes sense to add 2).

Ok, I'll do that and explain the whole thing in the API doc.

Berto
diff mbox

Patch

diff --git a/block.c b/block.c
index 28508ef..5ca972f 100644
--- a/block.c
+++ b/block.c
@@ -36,6 +36,7 @@ 
 #include "qmp-commands.h"
 #include "qemu/timer.h"
 #include "qapi-event.h"
+#include "block/throttle-groups.h"
 
 #ifdef CONFIG_BSD
 #include <sys/types.h>
@@ -130,7 +131,7 @@  void bdrv_set_io_limits(BlockDriverState *bs,
 {
     int i;
 
-    throttle_config(&bs->throttle_state, &bs->throttle_timers, cfg);
+    throttle_group_config(bs, cfg);
 
     for (i = 0; i < 2; i++) {
         qemu_co_enter_next(&bs->throttled_reqs[i]);
@@ -160,70 +161,33 @@  static bool bdrv_start_throttled_reqs(BlockDriverState *bs)
 void bdrv_io_limits_disable(BlockDriverState *bs)
 {
     bs->io_limits_enabled = false;
-
     bdrv_start_throttled_reqs(bs);
-
-    throttle_timers_destroy(&bs->throttle_timers);
-}
-
-static void bdrv_throttle_read_timer_cb(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-    qemu_co_enter_next(&bs->throttled_reqs[0]);
-}
-
-static void bdrv_throttle_write_timer_cb(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-    qemu_co_enter_next(&bs->throttled_reqs[1]);
+    throttle_group_unregister_bs(bs);
 }
 
 /* should be called before bdrv_set_io_limits if a limit is set */
-void bdrv_io_limits_enable(BlockDriverState *bs)
+void bdrv_io_limits_enable(BlockDriverState *bs, const char *group)
 {
     assert(!bs->io_limits_enabled);
-    throttle_init(&bs->throttle_state);
-    throttle_timers_init(&bs->throttle_timers,
-                         bdrv_get_aio_context(bs),
-                         QEMU_CLOCK_VIRTUAL,
-                         bdrv_throttle_read_timer_cb,
-                         bdrv_throttle_write_timer_cb,
-                         bs);
+    throttle_group_register_bs(bs, group);
     bs->io_limits_enabled = true;
 }
 
-/* This function makes an IO wait if needed
- *
- * @nb_sectors: the number of sectors of the IO
- * @is_write:   is the IO a write
- */
-static void bdrv_io_limits_intercept(BlockDriverState *bs,
-                                     unsigned int bytes,
-                                     bool is_write)
+void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group)
 {
-    /* does this io must wait */
-    bool must_wait = throttle_schedule_timer(&bs->throttle_state,
-                                             &bs->throttle_timers,
-                                             is_write);
-
-    /* if must wait or any request of this type throttled queue the IO */
-    if (must_wait ||
-        !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) {
-        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+    /* this bs is not part of any group */
+    if (!bs->throttle_state) {
+        return;
     }
 
-    /* the IO will be executed, do the accounting */
-    throttle_account(&bs->throttle_state, is_write, bytes);
-
-
-    /* if the next request must wait -> do nothing */
-    if (throttle_schedule_timer(&bs->throttle_state, &bs->throttle_timers,
-                                is_write)) {
+    /* this bs is a part of the same group than the one we want */
+    if (!g_strcmp0(throttle_group_get_name(bs), group)) {
         return;
     }
 
-    /* else queue next request for execution */
-    qemu_co_queue_next(&bs->throttled_reqs[is_write]);
+    /* need to change the group this bs belong to */
+    bdrv_io_limits_disable(bs);
+    bdrv_io_limits_enable(bs, group);
 }
 
 size_t bdrv_opt_mem_align(BlockDriverState *bs)
@@ -2091,15 +2055,18 @@  static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     bs_dest->enable_write_cache = bs_src->enable_write_cache;
 
     /* i/o throttled req */
-    memcpy(&bs_dest->throttle_state,
-           &bs_src->throttle_state,
-           sizeof(ThrottleState));
+    bs_dest->throttle_state     = bs_src->throttle_state,
+    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
+    bs_dest->pending_reqs[0]    = bs_src->pending_reqs[0];
+    bs_dest->pending_reqs[1]    = bs_src->pending_reqs[1];
+    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
+    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
+    memcpy(&bs_dest->round_robin,
+           &bs_src->round_robin,
+           sizeof(bs_dest->round_robin));
     memcpy(&bs_dest->throttle_timers,
            &bs_src->throttle_timers,
            sizeof(ThrottleTimers));
-    bs_dest->throttled_reqs[0]  = bs_src->throttled_reqs[0];
-    bs_dest->throttled_reqs[1]  = bs_src->throttled_reqs[1];
-    bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
 
     /* r/w error */
     bs_dest->on_read_error      = bs_src->on_read_error;
@@ -3170,7 +3137,7 @@  static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, bytes, false);
+        throttle_group_co_io_limits_intercept(bs, bytes, false);
     }
 
     /* Align read if necessary by padding qiov */
@@ -3411,7 +3378,7 @@  static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
 
     /* throttling disk I/O */
     if (bs->io_limits_enabled) {
-        bdrv_io_limits_intercept(bs, bytes, true);
+        throttle_group_co_io_limits_intercept(bs, bytes, true);
     }
 
     /*
diff --git a/block/qapi.c b/block/qapi.c
index 8a19aed..65af057 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -24,6 +24,7 @@ 
 
 #include "block/qapi.h"
 #include "block/block_int.h"
+#include "block/throttle-groups.h"
 #include "block/write-threshold.h"
 #include "qmp-commands.h"
 #include "qapi-visit.h"
@@ -63,7 +64,9 @@  BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
 
     if (bs->io_limits_enabled) {
         ThrottleConfig cfg;
-        throttle_get_config(&bs->throttle_state, &cfg);
+
+        throttle_group_get_config(bs, &cfg);
+
         info->bps     = cfg.buckets[THROTTLE_BPS_TOTAL].avg;
         info->bps_rd  = cfg.buckets[THROTTLE_BPS_READ].avg;
         info->bps_wr  = cfg.buckets[THROTTLE_BPS_WRITE].avg;
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index 352077f..f909f50 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -23,6 +23,8 @@ 
  */
 
 #include "block/throttle-groups.h"
+#include "qemu/queue.h"
+#include "qemu/thread.h"
 
 /* The ThrottleGroup structure (with its ThrottleState) is shared
  * among different BlockDriverState and it's independent from
@@ -160,6 +162,153 @@  static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs)
     return next;
 }
 
+/* Return the next BlockDriverState in the round-robin sequence with
+ * pending I/O requests.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:        the current BlockDriverState
+ * @is_write:  the type of operation (read/write)
+ * @ret:       the next BlockDriverState with pending requests, or bs
+ *             if there is none.
+ */
+static BlockDriverState *next_throttle_token(BlockDriverState *bs,
+                                             bool is_write)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    BlockDriverState *token, *start;
+
+    start = token = tg->tokens[is_write];
+
+    /* get next bs round in round robin style */
+    token = throttle_group_next_bs(token);
+    while (token != start && !token->pending_reqs[is_write]) {
+        token = throttle_group_next_bs(token);
+    }
+
+    /* If no IO are queued for scheduling on the next round robin token
+     * then decide the token is the current bs because chances are
+     * the current bs get the current request queued.
+     */
+    if (token == start && !token->pending_reqs[is_write]) {
+        token = bs;
+    }
+
+    return token;
+}
+
+/* Check if the next I/O request for a BlockDriverState needs to be
+ * throttled or not. If there's no timer set in this group, set one
+ * and update the token accordingly.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:         the current BlockDriverState
+ * @is_write:   the type of operation (read/write)
+ * @ret:        whether the I/O request needs to be throttled or not
+ */
+static bool throttle_group_schedule_timer(BlockDriverState *bs,
+                                          bool is_write)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleTimers *tt = &bs->throttle_timers;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    bool must_wait;
+
+    /* Check if any of the timers in this group is already armed */
+    if (tg->any_timer_armed[is_write]) {
+        return true;
+    }
+
+    must_wait = throttle_schedule_timer(ts, tt, is_write);
+
+    /* If a timer just got armed, set bs as the current token */
+    if (must_wait) {
+        tg->tokens[is_write] = bs;
+        tg->any_timer_armed[is_write] = true;
+    }
+
+    return must_wait;
+}
+
+/* Look for the next pending I/O request and schedule it.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @bs:        the current BlockDriverState
+ * @is_write:  the type of operation (read/write)
+ */
+static void schedule_next_request(BlockDriverState *bs, bool is_write)
+{
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    bool must_wait;
+    BlockDriverState *token;
+
+    /* Check if there's any pending request to schedule next */
+    token = next_throttle_token(bs, is_write);
+    if (!token->pending_reqs[is_write]) {
+        return;
+    }
+
+    /* Set a timer for the request if it needs to be throttled */
+    must_wait = throttle_group_schedule_timer(token, is_write);
+
+    /* If it doesn't have to wait, queue it for immediate execution */
+    if (!must_wait) {
+        /* Give preference to requests from the current bs */
+        if (qemu_in_coroutine() &&
+            qemu_co_queue_next(&bs->throttled_reqs[is_write])) {
+            token = bs;
+        } else {
+            ThrottleTimers *tt = &token->throttle_timers;
+            int64_t now = qemu_clock_get_ns(tt->clock_type);
+            timer_mod(tt->timers[is_write], now + 1);
+            tg->any_timer_armed[is_write] = true;
+        }
+        tg->tokens[is_write] = token;
+    }
+}
+
+/* Check if an I/O request needs to be throttled, wait and set a timer
+ * if necessary, and schedule the next request using a round robin
+ * algorithm.
+ *
+ * @bs:        the current BlockDriverState
+ * @bytes:     the number of bytes for this I/O
+ * @is_write:  the type of operation (read/write)
+ */
+void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
+                                                        unsigned int bytes,
+                                                        bool is_write)
+{
+    bool must_wait;
+    BlockDriverState *token;
+
+    ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
+    qemu_mutex_lock(&tg->lock);
+
+    /* First we check if this I/O has to be throttled. */
+    token = next_throttle_token(bs, is_write);
+    must_wait = throttle_group_schedule_timer(token, is_write);
+
+    /* Wait if there's a timer set or queued requests of this type */
+    if (must_wait || bs->pending_reqs[is_write]) {
+        bs->pending_reqs[is_write]++;
+        qemu_mutex_unlock(&tg->lock);
+        qemu_co_queue_wait(&bs->throttled_reqs[is_write]);
+        qemu_mutex_lock(&tg->lock);
+        bs->pending_reqs[is_write]--;
+    }
+
+    /* The I/O will be executed, so do the accounting */
+    throttle_account(bs->throttle_state, is_write, bytes);
+
+    /* Schedule the next request */
+    schedule_next_request(bs, is_write);
+
+    qemu_mutex_unlock(&tg->lock);
+}
+
 /* Update the throttle configuration for a particular group. Similar
  * to throttle_config(), but guarantees atomicity within the
  * throttling group.
@@ -195,9 +344,49 @@  void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg)
     qemu_mutex_unlock(&tg->lock);
 }
 
-/* Register a BlockDriverState in the throttling group, also updating
- * its throttle_state pointer to point to it. If a throttling group
- * with that name does not exist yet, it will be created.
+/* ThrottleTimers callback. This wakes up a request that was waiting
+ * because it had been throttled.
+ *
+ * @bs:        the BlockDriverState whose request had been throttled
+ * @is_write:  the type of operation (read/write)
+ */
+static void timer_cb(BlockDriverState *bs, bool is_write)
+{
+    ThrottleState *ts = bs->throttle_state;
+    ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts);
+    bool empty_queue;
+
+    /* The timer has just been fired, so we can update the flag */
+    qemu_mutex_lock(&tg->lock);
+    tg->any_timer_armed[is_write] = false;
+    qemu_mutex_unlock(&tg->lock);
+
+    /* Run the request that was waiting for this timer */
+    empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]);
+
+    /* If the request queue was empty then we have to take care of
+     * scheduling the next one */
+    if (empty_queue) {
+        qemu_mutex_lock(&tg->lock);
+        schedule_next_request(bs, is_write);
+        qemu_mutex_unlock(&tg->lock);
+    }
+}
+
+static void read_timer_cb(void *opaque)
+{
+    timer_cb(opaque, false);
+}
+
+static void write_timer_cb(void *opaque)
+{
+    timer_cb(opaque, true);
+}
+
+/* Register a BlockDriverState in the throttling group, also
+ * initializing its timers and updating its throttle_state pointer to
+ * point to it. If a throttling group with that name does not exist
+ * yet, it will be created.
  *
  * @bs:        the BlockDriverState to insert
  * @groupname: the name of the group
@@ -218,11 +407,20 @@  void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
     }
 
     QLIST_INSERT_HEAD(&tg->head, bs, round_robin);
+
+    throttle_timers_init(&bs->throttle_timers,
+                         bdrv_get_aio_context(bs),
+                         QEMU_CLOCK_VIRTUAL,
+                         read_timer_cb,
+                         write_timer_cb,
+                         bs);
+
     qemu_mutex_unlock(&tg->lock);
 }
 
 /* Unregister a BlockDriverState from its group, removing it from the
- * list and setting the throttle_state pointer to NULL.
+ * list, destroying the timers and setting the throttle_state pointer
+ * to NULL.
  *
  * The group will be destroyed if it's empty after this operation.
  *
@@ -247,6 +445,7 @@  void throttle_group_unregister_bs(BlockDriverState *bs)
 
     /* remove the current bs from the list */
     QLIST_REMOVE(bs, round_robin);
+    throttle_timers_destroy(&bs->throttle_timers);
     qemu_mutex_unlock(&tg->lock);
 
     throttle_group_unref(tg);
diff --git a/blockdev.c b/blockdev.c
index fbb3a79..98ef26f 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -357,6 +357,7 @@  static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
     const char *id;
     bool has_driver_specific_opts;
     BlockdevDetectZeroesOptions detect_zeroes;
+    const char *throttling_group;
 
     /* Check common options by copying from bs_opts to opts, all other options
      * stay in bs_opts for processing by bdrv_open(). */
@@ -459,6 +460,8 @@  static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
 
     cfg.op_size = qemu_opt_get_number(opts, "throttling.iops-size", 0);
 
+    throttling_group = qemu_opt_get(opts, "throttling.group");
+
     if (!check_throttle_config(&cfg, &error)) {
         error_propagate(errp, error);
         goto early_err;
@@ -547,7 +550,10 @@  static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
 
     /* disk I/O throttling */
     if (throttle_enabled(&cfg)) {
-        bdrv_io_limits_enable(bs);
+        if (!throttling_group) {
+            throttling_group = blk_name(blk);
+        }
+        bdrv_io_limits_enable(bs, throttling_group);
         bdrv_set_io_limits(bs, &cfg);
     }
 
@@ -711,6 +717,8 @@  DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType block_default_type)
 
         { "iops_size",      "throttling.iops-size" },
 
+        { "group",          "throttling.group" },
+
         { "readonly",       "read-only" },
     };
 
@@ -1887,7 +1895,9 @@  void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
                                bool has_iops_wr_max,
                                int64_t iops_wr_max,
                                bool has_iops_size,
-                               int64_t iops_size, Error **errp)
+                               int64_t iops_size,
+                               bool has_group,
+                               const char *group, Error **errp)
 {
     ThrottleConfig cfg;
     BlockDriverState *bs;
@@ -1941,9 +1951,11 @@  void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
     aio_context_acquire(aio_context);
 
     if (!bs->io_limits_enabled && throttle_enabled(&cfg)) {
-        bdrv_io_limits_enable(bs);
+        bdrv_io_limits_enable(bs, has_group ? group : device);
     } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) {
         bdrv_io_limits_disable(bs);
+    } else if (bs->io_limits_enabled && throttle_enabled(&cfg)) {
+        bdrv_io_limits_update_group(bs, has_group ? group : device);
     }
 
     if (bs->io_limits_enabled) {
@@ -3017,6 +3029,10 @@  QemuOptsList qemu_common_drive_opts = {
             .type = QEMU_OPT_NUMBER,
             .help = "when limiting by iops max size of an I/O in bytes",
         },{
+            .name = "throttling.group",
+            .type = QEMU_OPT_STRING,
+            .help = "name of the block throttling group",
+        },{
             .name = "copy-on-read",
             .type = QEMU_OPT_BOOL,
             .help = "copy read data from backing file into image file",
diff --git a/hmp.c b/hmp.c
index f31ae27..50f30f2 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1272,7 +1272,9 @@  void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict)
                               false,
                               0,
                               false, /* No default I/O size */
-                              0, &err);
+                              0,
+                              false,
+                              NULL, &err);
     hmp_handle_error(mon, &err);
 }
 
diff --git a/include/block/block.h b/include/block/block.h
index 4c57d63..e0d53f3 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -173,8 +173,9 @@  void bdrv_stats_print(Monitor *mon, const QObject *data);
 void bdrv_info_stats(Monitor *mon, QObject **ret_data);
 
 /* disk I/O throttling */
-void bdrv_io_limits_enable(BlockDriverState *bs);
+void bdrv_io_limits_enable(BlockDriverState *bs, const char *group);
 void bdrv_io_limits_disable(BlockDriverState *bs);
+void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group);
 
 void bdrv_init(void);
 void bdrv_init_with_whitelist(void);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index f0896e9..4e09825 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -376,10 +376,13 @@  struct BlockDriverState {
     unsigned int serialising_in_flight;
 
     /* I/O throttling */
-    ThrottleState throttle_state;
-    ThrottleTimers throttle_timers;
     CoQueue      throttled_reqs[2];
     bool         io_limits_enabled;
+    /* The following fields are protected by the ThrottleGroup lock.
+     * See the ThrottleGroup documentation for details. */
+    ThrottleState *throttle_state;
+    ThrottleTimers throttle_timers;
+    unsigned       pending_reqs[2];
     QLIST_ENTRY(BlockDriverState) round_robin;
 
     /* I/O stats (display with "info blockstats"). */
diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h
index b966ec7..322139a 100644
--- a/include/block/throttle-groups.h
+++ b/include/block/throttle-groups.h
@@ -36,4 +36,8 @@  void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg);
 void throttle_group_register_bs(BlockDriverState *bs, const char *groupname);
 void throttle_group_unregister_bs(BlockDriverState *bs);
 
+void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs,
+                                                        unsigned int bytes,
+                                                        bool is_write);
+
 #endif
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 7873084..d8211b7 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -990,6 +990,8 @@ 
 #
 # @iops_size: #optional an I/O size in bytes (Since 1.7)
 #
+# @group: #optional throttle group name (Since 2.3)
+#
 # Returns: Nothing on success
 #          If @device is not a valid block device, DeviceNotFound
 #
@@ -1001,7 +1003,7 @@ 
             '*bps_max': 'int', '*bps_rd_max': 'int',
             '*bps_wr_max': 'int', '*iops_max': 'int',
             '*iops_rd_max': 'int', '*iops_wr_max': 'int',
-            '*iops_size': 'int' } }
+            '*iops_size': 'int', '*group': 'str' } }
 
 ##
 # @block-stream:
diff --git a/qemu-options.hx b/qemu-options.hx
index 319d971..7d6b2ec 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -464,6 +464,7 @@  DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "       [[,bps_max=bm]|[[,bps_rd_max=rm][,bps_wr_max=wm]]]\n"
     "       [[,iops_max=im]|[[,iops_rd_max=irm][,iops_wr_max=iwm]]]\n"
     "       [[,iops_size=is]]\n"
+    "       [[,group=g]]\n"
     "                use 'file' as a drive image\n", QEMU_ARCH_ALL)
 STEXI
 @item -drive @var{option}[,@var{option}[,@var{option}[,...]]]
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 3a42ad0..ce897ff 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -1730,7 +1730,7 @@  EQMP
 
     {
         .name       = "block_set_io_throttle",
-        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l,bps_max:l?,bps_rd_max:l?,bps_wr_max:l?,iops_max:l?,iops_rd_max:l?,iops_wr_max:l?,iops_size:l?",
+        .args_type  = "device:B,bps:l,bps_rd:l,bps_wr:l,iops:l,iops_rd:l,iops_wr:l,bps_max:l?,bps_rd_max:l?,bps_wr_max:l?,iops_max:l?,iops_rd_max:l?,iops_wr_max:l?,iops_size:l?,group:s?",
         .mhandler.cmd_new = qmp_marshal_input_block_set_io_throttle,
     },
 
@@ -1756,6 +1756,7 @@  Arguments:
 - "iops_rd_max":  read I/O operations max (json-int)
 - "iops_wr_max":  write I/O operations max (json-int)
 - "iops_size":  I/O size in bytes when limiting (json-int)
+- "group": throttle group name (json-string)
 
 Example: