diff mbox

[v3,09/21] block: Add average I/O queue depth to BlockDeviceTimedStats

Message ID 553b81f91762993bf5d43a691aedb4fa2168bc3a.1445500464.git.berto@igalia.com
State New
Headers show

Commit Message

Alberto Garcia Oct. 22, 2015, 8:11 a.m. UTC
This patch adds two new fields to BlockDeviceTimedStats that track the
average number of pending read and write requests for a block device.

The values are calculated for the period of time defined for that
interval.

Signed-off-by: Alberto Garcia <berto@igalia.com>
---
 block/accounting.c           | 12 ++++++++++++
 block/qapi.c                 |  5 +++++
 include/block/accounting.h   |  2 ++
 include/qemu/timed-average.h |  1 +
 qapi/block-core.json         |  9 ++++++++-
 qmp-commands.hx              |  6 ++++++
 util/timed-average.c         | 17 +++++++++++++++++
 7 files changed, 51 insertions(+), 1 deletion(-)

Comments

Stefan Hajnoczi Oct. 23, 2015, 1:31 p.m. UTC | #1
On Thu, Oct 22, 2015 at 11:11:19AM +0300, Alberto Garcia wrote:
> +/* Get the sum of all accounted values
> + * @ta:      the TimedAverage structure
> + * @elapsed: if non-NULL, the elapsed time (in ns) will be stored here
> + * @ret:     the sum of all accounted values
> + */
> +uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed)
> +{
> +    TimedAverageWindow *w;
> +    check_expirations(ta);
> +    w = current_window(ta);
> +    if (elapsed != NULL) {
> +        int64_t remaining = w->expiration - qemu_clock_get_ns(ta->clock_type);
> +        *elapsed = ta->period - remaining;

There is no guarantee that qemu_clock_get_ns(ta->clock_type) executes
quickly after check_expirations(ta).  If the system is under heavy load
and swapping, maybe significant amounts of time have passed.  This race
condition could result in bad elapsed values being calculated.

We need to be careful about elapsed <= 0 values, especially if the
caller divides by elapsed and is exposed to Divide By 0 exceptions.

Either check_expirations() needs to integrate qemu_clock_get_ns() or we
simply need to deal with bogus values here.
Alberto Garcia Oct. 23, 2015, 1:47 p.m. UTC | #2
On Fri 23 Oct 2015 03:31:38 PM CEST, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>> +uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed)
>> +{
>> +    TimedAverageWindow *w;
>> +    check_expirations(ta);
>> +    w = current_window(ta);
>> +    if (elapsed != NULL) {
>> +        int64_t remaining = w->expiration - qemu_clock_get_ns(ta->clock_type);
>> +        *elapsed = ta->period - remaining;
>
> There is no guarantee that qemu_clock_get_ns(ta->clock_type) executes
> quickly after check_expirations(ta).  If the system is under heavy load
> and swapping, maybe significant amounts of time have passed.  This race
> condition could result in bad elapsed values being calculated.
>
> We need to be careful about elapsed <= 0 values, especially if the
> caller divides by elapsed and is exposed to Divide By 0 exceptions.
>
> Either check_expirations() needs to integrate qemu_clock_get_ns() or we
> simply need to deal with bogus values here.

Good catch, thanks!

I think having check_expirations() return the time is probably the best
option.

Berto
diff mbox

Patch

diff --git a/block/accounting.c b/block/accounting.c
index 61de8ce..a941931 100644
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -143,3 +143,15 @@  int64_t block_acct_idle_time_ns(BlockAcctStats *stats)
 {
     return qemu_clock_get_ns(clock_type) - stats->last_access_time_ns;
 }
+
+double block_acct_queue_depth(BlockAcctTimedStats *stats,
+                              enum BlockAcctType type)
+{
+    uint64_t sum, elapsed;
+
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    sum = timed_average_sum(&stats->latency[type], &elapsed);
+
+    return (double) sum / elapsed;
+}
diff --git a/block/qapi.c b/block/qapi.c
index 4baf6e1..99d5303 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -402,6 +402,11 @@  static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
             dev_stats->min_flush_latency_ns = timed_average_min(fl);
             dev_stats->max_flush_latency_ns = timed_average_max(fl);
             dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
+
+            dev_stats->avg_rd_queue_depth =
+                block_acct_queue_depth(ts, BLOCK_ACCT_READ);
+            dev_stats->avg_wr_queue_depth =
+                block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
         }
     }
 
diff --git a/include/block/accounting.h b/include/block/accounting.h
index 09605bb..f41ddde 100644
--- a/include/block/accounting.h
+++ b/include/block/accounting.h
@@ -78,5 +78,7 @@  void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type);
 void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
                            int num_requests);
 int64_t block_acct_idle_time_ns(BlockAcctStats *stats);
+double block_acct_queue_depth(BlockAcctTimedStats *stats,
+                              enum BlockAcctType type);
 
 #endif
diff --git a/include/qemu/timed-average.h b/include/qemu/timed-average.h
index f1cdddc..364bf88 100644
--- a/include/qemu/timed-average.h
+++ b/include/qemu/timed-average.h
@@ -59,5 +59,6 @@  void timed_average_account(TimedAverage *ta, uint64_t value);
 uint64_t timed_average_min(TimedAverage *ta);
 uint64_t timed_average_avg(TimedAverage *ta);
 uint64_t timed_average_max(TimedAverage *ta);
+uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed);
 
 #endif
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 741f7e6..e32b523 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -450,6 +450,12 @@ 
 # @avg_flush_latency_ns: Average latency of flush operations in the
 #                        defined interval, in nanoseconds.
 #
+# @avg_rd_queue_depth: Average number of pending read operations
+#                      in the defined interval.
+#
+# @avg_wr_queue_depth: Average number of pending write operations
+#                      in the defined interval.
+#
 # Since: 2.5
 ##
 
@@ -458,7 +464,8 @@ 
             'max_rd_latency_ns': 'int', 'avg_rd_latency_ns': 'int',
             'min_wr_latency_ns': 'int', 'max_wr_latency_ns': 'int',
             'avg_wr_latency_ns': 'int', 'min_flush_latency_ns': 'int',
-            'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int' } }
+            'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int',
+            'avg_rd_queue_depth': 'number', 'avg_wr_queue_depth': 'number' } }
 
 ##
 # @BlockDeviceStats:
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 9f1d2ab..526a317 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -2578,6 +2578,12 @@  Each json-object contain the following:
         - "avg_flush_latency_ns": average latency of flush operations
                                   in the defined interval, in
                                   nanoseconds (json-int)
+        - "avg_rd_queue_depth": average number of pending read
+                                operations in the defined interval
+                                (json-number)
+        - "avg_wr_queue_depth": average number of pending write
+                                operations in the defined interval
+                                (json-number).
 - "parent": Contains recursively the statistics of the underlying
             protocol (e.g. the host file for a qcow2 image). If there is
             no underlying protocol, this field is omitted
diff --git a/util/timed-average.c b/util/timed-average.c
index 98a1170..70926ef 100644
--- a/util/timed-average.c
+++ b/util/timed-average.c
@@ -208,3 +208,20 @@  uint64_t timed_average_max(TimedAverage *ta)
     check_expirations(ta);
     return current_window(ta)->max;
 }
+
+/* Get the sum of all accounted values
+ * @ta:      the TimedAverage structure
+ * @elapsed: if non-NULL, the elapsed time (in ns) will be stored here
+ * @ret:     the sum of all accounted values
+ */
+uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed)
+{
+    TimedAverageWindow *w;
+    check_expirations(ta);
+    w = current_window(ta);
+    if (elapsed != NULL) {
+        int64_t remaining = w->expiration - qemu_clock_get_ns(ta->clock_type);
+        *elapsed = ta->period - remaining;
+    }
+    return w->sum;
+}