Patchwork [RFC,1/4] block: fix I/O throttling accounting blind spot

login
register
mail settings
Submitter Stefan Hajnoczi
Date March 21, 2013, 2:49 p.m.
Message ID <1363877399-16339-2-git-send-email-stefanha@redhat.com>
Download mbox | patch
Permalink /patch/229724/
State New
Headers show

Comments

Stefan Hajnoczi - March 21, 2013, 2:49 p.m.
I/O throttling relies on bdrv_acct_done() which is called when a request
completes.  This leaves a blind spot since we only charge for completed
requests, not submitted requests.

For example, if there is 1 operation remaining in this time slice the
guest could submit 3 operations and they will all be submitted
successfully since they don't actually get accounted for until they
complete.

Originally we probably thought this is okay since the requests will be
accounted when the time slice is extended.  In practice it causes
fluctuations since the guest can exceed its I/O limit and it will be
punished for this later on.

Account for I/O upon submission so that I/O limits are enforced
properly.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block.c                   | 19 ++++++++-----------
 include/block/block_int.h |  2 +-
 2 files changed, 9 insertions(+), 12 deletions(-)
Zhiyong Wu - March 27, 2013, 8:50 a.m.
On Thu, Mar 21, 2013 at 10:49 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> I/O throttling relies on bdrv_acct_done() which is called when a request
> completes.  This leaves a blind spot since we only charge for completed
> requests, not submitted requests.
>
> For example, if there is 1 operation remaining in this time slice the
> guest could submit 3 operations and they will all be submitted
> successfully since they don't actually get accounted for until they
> complete.
>
> Originally we probably thought this is okay since the requests will be
> accounted when the time slice is extended.  In practice it causes
> fluctuations since the guest can exceed its I/O limit and it will be
> punished for this later on.
>
> Account for I/O upon submission so that I/O limits are enforced
> properly.
>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  block.c                   | 19 ++++++++-----------
>  include/block/block_int.h |  2 +-
>  2 files changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/block.c b/block.c
> index 0a062c9..31fb0b0 100644
> --- a/block.c
> +++ b/block.c
> @@ -141,7 +141,6 @@ void bdrv_io_limits_disable(BlockDriverState *bs)
>      bs->slice_start = 0;
>      bs->slice_end   = 0;
>      bs->slice_time  = 0;
> -    memset(&bs->io_base, 0, sizeof(bs->io_base));
If we try some concussive operations, enable I/O throttling at first,
then disable it, and then enable it, how about? I guess that
bs->slice_submitted will maybe be not correct.
>  }
>
>  static void bdrv_block_timer(void *opaque)
> @@ -1329,8 +1328,8 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
>      bs_dest->slice_time         = bs_src->slice_time;
>      bs_dest->slice_start        = bs_src->slice_start;
>      bs_dest->slice_end          = bs_src->slice_end;
> +    bs_dest->slice_submitted    = bs_src->slice_submitted;
>      bs_dest->io_limits          = bs_src->io_limits;
> -    bs_dest->io_base            = bs_src->io_base;
>      bs_dest->throttled_reqs     = bs_src->throttled_reqs;
>      bs_dest->block_timer        = bs_src->block_timer;
>      bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
> @@ -3665,9 +3664,9 @@ static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>      slice_time = bs->slice_end - bs->slice_start;
>      slice_time /= (NANOSECONDS_PER_SECOND);
>      bytes_limit = bps_limit * slice_time;
> -    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
> +    bytes_base  = bs->slice_submitted.bytes[is_write];
>      if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
> -        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
> +        bytes_base += bs->slice_submitted.bytes[!is_write];
>      }
>
>      /* bytes_base: the bytes of data which have been read/written; and
> @@ -3683,6 +3682,7 @@ static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
>              *wait = 0;
>          }
>
> +        bs->slice_submitted.bytes[is_write] += bytes_res;
>          return false;
>      }
>
> @@ -3725,9 +3725,9 @@ static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>      slice_time = bs->slice_end - bs->slice_start;
>      slice_time /= (NANOSECONDS_PER_SECOND);
>      ios_limit  = iops_limit * slice_time;
> -    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
> +    ios_base   = bs->slice_submitted.ios[is_write];
>      if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
> -        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
> +        ios_base += bs->slice_submitted.ios[!is_write];
>      }
>
>      if (ios_base + 1 <= ios_limit) {
> @@ -3735,6 +3735,7 @@ static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
>              *wait = 0;
>          }
>
> +        bs->slice_submitted.ios[is_write]++;
>          return false;
>      }
>
> @@ -3772,11 +3773,7 @@ static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>          bs->slice_start = now;
>          bs->slice_end   = now + bs->slice_time;
>
> -        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
> -        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
> -
> -        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
> -        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
> +        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
>      }
>
>      elapsed_time  = now - bs->slice_start;
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index ce0aa26..b461764 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -251,7 +251,7 @@ struct BlockDriverState {
>      int64_t slice_start;
>      int64_t slice_end;
>      BlockIOLimit io_limits;
> -    BlockIOBaseValue  io_base;
> +    BlockIOBaseValue slice_submitted;
>      CoQueue      throttled_reqs;
>      QEMUTimer    *block_timer;
>      bool         io_limits_enabled;
> --
> 1.8.1.4
>
>
Stefan Hajnoczi - March 27, 2013, 9:14 a.m.
On Wed, Mar 27, 2013 at 9:50 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Thu, Mar 21, 2013 at 10:49 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>> I/O throttling relies on bdrv_acct_done() which is called when a request
>> completes.  This leaves a blind spot since we only charge for completed
>> requests, not submitted requests.
>>
>> For example, if there is 1 operation remaining in this time slice the
>> guest could submit 3 operations and they will all be submitted
>> successfully since they don't actually get accounted for until they
>> complete.
>>
>> Originally we probably thought this is okay since the requests will be
>> accounted when the time slice is extended.  In practice it causes
>> fluctuations since the guest can exceed its I/O limit and it will be
>> punished for this later on.
>>
>> Account for I/O upon submission so that I/O limits are enforced
>> properly.
>>
>> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
>> ---
>>  block.c                   | 19 ++++++++-----------
>>  include/block/block_int.h |  2 +-
>>  2 files changed, 9 insertions(+), 12 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index 0a062c9..31fb0b0 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -141,7 +141,6 @@ void bdrv_io_limits_disable(BlockDriverState *bs)
>>      bs->slice_start = 0;
>>      bs->slice_end   = 0;
>>      bs->slice_time  = 0;
>> -    memset(&bs->io_base, 0, sizeof(bs->io_base));
> If we try some concussive operations, enable I/O throttling at first,
> then disable it, and then enable it, how about? I guess that
> bs->slice_submitted will maybe be not correct.

The memset() was moved...

>> @@ -3772,11 +3773,7 @@ static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>          bs->slice_start = now;
>>          bs->slice_end   = now + bs->slice_time;
>>
>> -        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
>> -        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
>> -
>> -        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
>> -        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
>> +        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
>>      }
>>
>>      elapsed_time  = now - bs->slice_start;

...here.

Since bs->slice_start = 0 when I/O throttling is disabled we will
start a new slice next time bdrv_exceed_io_limits() is called.

Therefore bs->slice_submitted is consistent across disable/enable.

Stefan
Zhiyong Wu - March 27, 2013, 12:49 p.m.
On Wed, Mar 27, 2013 at 5:14 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Wed, Mar 27, 2013 at 9:50 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Thu, Mar 21, 2013 at 10:49 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> I/O throttling relies on bdrv_acct_done() which is called when a request
>>> completes.  This leaves a blind spot since we only charge for completed
>>> requests, not submitted requests.
>>>
>>> For example, if there is 1 operation remaining in this time slice the
>>> guest could submit 3 operations and they will all be submitted
>>> successfully since they don't actually get accounted for until they
>>> complete.
>>>
>>> Originally we probably thought this is okay since the requests will be
>>> accounted when the time slice is extended.  In practice it causes
>>> fluctuations since the guest can exceed its I/O limit and it will be
>>> punished for this later on.
>>>
>>> Account for I/O upon submission so that I/O limits are enforced
>>> properly.
>>>
>>> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
>>> ---
>>>  block.c                   | 19 ++++++++-----------
>>>  include/block/block_int.h |  2 +-
>>>  2 files changed, 9 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/block.c b/block.c
>>> index 0a062c9..31fb0b0 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -141,7 +141,6 @@ void bdrv_io_limits_disable(BlockDriverState *bs)
>>>      bs->slice_start = 0;
>>>      bs->slice_end   = 0;
>>>      bs->slice_time  = 0;
>>> -    memset(&bs->io_base, 0, sizeof(bs->io_base));
>> If we try some concussive operations, enable I/O throttling at first,
>> then disable it, and then enable it, how about? I guess that
>> bs->slice_submitted will maybe be not correct.
>
> The memset() was moved...
>
>>> @@ -3772,11 +3773,7 @@ static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
>>>          bs->slice_start = now;
>>>          bs->slice_end   = now + bs->slice_time;
>>>
>>> -        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
>>> -        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
>>> -
>>> -        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
>>> -        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
>>> +        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
>>>      }
>>>
>>>      elapsed_time  = now - bs->slice_start;
>
> ...here.
>
> Since bs->slice_start = 0 when I/O throttling is disabled we will
> start a new slice next time bdrv_exceed_io_limits() is called.
>
> Therefore bs->slice_submitted is consistent across disable/enable.
Yes, i also realized this just when i came home by subway.
>
> Stefan

Patch

diff --git a/block.c b/block.c
index 0a062c9..31fb0b0 100644
--- a/block.c
+++ b/block.c
@@ -141,7 +141,6 @@  void bdrv_io_limits_disable(BlockDriverState *bs)
     bs->slice_start = 0;
     bs->slice_end   = 0;
     bs->slice_time  = 0;
-    memset(&bs->io_base, 0, sizeof(bs->io_base));
 }
 
 static void bdrv_block_timer(void *opaque)
@@ -1329,8 +1328,8 @@  static void bdrv_move_feature_fields(BlockDriverState *bs_dest,
     bs_dest->slice_time         = bs_src->slice_time;
     bs_dest->slice_start        = bs_src->slice_start;
     bs_dest->slice_end          = bs_src->slice_end;
+    bs_dest->slice_submitted    = bs_src->slice_submitted;
     bs_dest->io_limits          = bs_src->io_limits;
-    bs_dest->io_base            = bs_src->io_base;
     bs_dest->throttled_reqs     = bs_src->throttled_reqs;
     bs_dest->block_timer        = bs_src->block_timer;
     bs_dest->io_limits_enabled  = bs_src->io_limits_enabled;
@@ -3665,9 +3664,9 @@  static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
     slice_time = bs->slice_end - bs->slice_start;
     slice_time /= (NANOSECONDS_PER_SECOND);
     bytes_limit = bps_limit * slice_time;
-    bytes_base  = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write];
+    bytes_base  = bs->slice_submitted.bytes[is_write];
     if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) {
-        bytes_base += bs->nr_bytes[!is_write] - bs->io_base.bytes[!is_write];
+        bytes_base += bs->slice_submitted.bytes[!is_write];
     }
 
     /* bytes_base: the bytes of data which have been read/written; and
@@ -3683,6 +3682,7 @@  static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors,
             *wait = 0;
         }
 
+        bs->slice_submitted.bytes[is_write] += bytes_res;
         return false;
     }
 
@@ -3725,9 +3725,9 @@  static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
     slice_time = bs->slice_end - bs->slice_start;
     slice_time /= (NANOSECONDS_PER_SECOND);
     ios_limit  = iops_limit * slice_time;
-    ios_base   = bs->nr_ops[is_write] - bs->io_base.ios[is_write];
+    ios_base   = bs->slice_submitted.ios[is_write];
     if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) {
-        ios_base += bs->nr_ops[!is_write] - bs->io_base.ios[!is_write];
+        ios_base += bs->slice_submitted.ios[!is_write];
     }
 
     if (ios_base + 1 <= ios_limit) {
@@ -3735,6 +3735,7 @@  static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write,
             *wait = 0;
         }
 
+        bs->slice_submitted.ios[is_write]++;
         return false;
     }
 
@@ -3772,11 +3773,7 @@  static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors,
         bs->slice_start = now;
         bs->slice_end   = now + bs->slice_time;
 
-        bs->io_base.bytes[is_write]  = bs->nr_bytes[is_write];
-        bs->io_base.bytes[!is_write] = bs->nr_bytes[!is_write];
-
-        bs->io_base.ios[is_write]    = bs->nr_ops[is_write];
-        bs->io_base.ios[!is_write]   = bs->nr_ops[!is_write];
+        memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted));
     }
 
     elapsed_time  = now - bs->slice_start;
diff --git a/include/block/block_int.h b/include/block/block_int.h
index ce0aa26..b461764 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -251,7 +251,7 @@  struct BlockDriverState {
     int64_t slice_start;
     int64_t slice_end;
     BlockIOLimit io_limits;
-    BlockIOBaseValue  io_base;
+    BlockIOBaseValue slice_submitted;
     CoQueue      throttled_reqs;
     QEMUTimer    *block_timer;
     bool         io_limits_enabled;