diff mbox series

[ovs-dev] utilities: Add upcall statistics to the kernel_delay.py script.

Message ID 173254404327.545539.5146903122430077833.stgit@wsfd-advnetlab224.anl.eng.rdu2.dc.redhat.com
State New
Delegated to: aaron conole
Headers show
Series [ovs-dev] utilities: Add upcall statistics to the kernel_delay.py script. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test fail github build: failed
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

Eelco Chaudron Nov. 25, 2024, 2:14 p.m. UTC
This patch installs a kernel return probe on ovs_dp_upcall() to record
all successful and failed calls per CPU. The statistics are reported
when the script exits, providing insights into the upcall behavior.

This is an example output:

  # UPCALL STATISTICS:
    Total upcalls       : 183247 [0: 8937, 2: 14464, 4: 10372, 6: 4254, ...]
    Successfull upcalls : 120195 [0: 5945, 2: 6379, 4: 5966, 6: 4254, ...]
    Failed upcalls      : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...]
       11, EAGAIN       : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...]

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 utilities/automake.mk                   |    1 
 utilities/usdt-scripts/kernel_delay.py  |  127 +++++++++++++++++++++++++++++++
 utilities/usdt-scripts/kernel_delay.rst |    9 ++
 3 files changed, 136 insertions(+), 1 deletion(-)

Comments

Eelco Chaudron Nov. 27, 2024, 7:40 a.m. UTC | #1
On 25 Nov 2024, at 15:14, Eelco Chaudron wrote:

> This patch installs a kernel return probe on ovs_dp_upcall() to record
> all successful and failed calls per CPU. The statistics are reported
> when the script exits, providing insights into the upcall behavior.
>
> This is an example output:
>
>   # UPCALL STATISTICS:
>     Total upcalls       : 183247 [0: 8937, 2: 14464, 4: 10372, 6: 4254, ...]
>     Successfull upcalls : 120195 [0: 5945, 2: 6379, 4: 5966, 6: 4254, ...]
>     Failed upcalls      : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...]
>        11, EAGAIN       : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...]
>
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>

Recheck-request: github-robot
diff mbox series

Patch

diff --git a/utilities/automake.mk b/utilities/automake.mk
index acc1af4e0..22260b254 100644
--- a/utilities/automake.mk
+++ b/utilities/automake.mk
@@ -149,6 +149,7 @@  FLAKE8_PYFILES += utilities/ovs-pcap.in \
 	utilities/ovs-pipegen.py \
 	utilities/usdt-scripts/dpif_op_nl_monitor.py \
 	utilities/usdt-scripts/flow_reval_monitor.py \
+	utilities/usdt-scripts/kernel_delay.py \
 	utilities/usdt-scripts/upcall_monitor.py \
 	utilities/usdt-scripts/upcall_cost.py
 
diff --git a/utilities/usdt-scripts/kernel_delay.py b/utilities/usdt-scripts/kernel_delay.py
index de6b0c9de..a5aa1a3d4 100755
--- a/utilities/usdt-scripts/kernel_delay.py
+++ b/utilities/usdt-scripts/kernel_delay.py
@@ -39,6 +39,7 @@ 
 #
 import argparse
 import datetime
+import errno
 import os
 import pytz
 import psutil
@@ -556,6 +557,30 @@  TRACEPOINT_PROBE(irq, softirq_exit)
     data->start_ns = 0;
     return 0;
 }
+
+
+/*
+ * For measuring upcall statistics (per CPU).
+ */
+#define MAX_ERROR 255
+BPF_TABLE("percpu_array", uint32_t, uint64_t, upcall_count, MAX_ERROR + 1);
+
+#if <INSTALL_OVS_DP_UPCALL_PROBE>
+int kretprobe__ovs_dp_upcall(struct pt_regs *ctx)
+{
+    int ret = PT_REGS_RC(ctx);
+
+    if (ret > 0)
+        ret = 0;
+
+    ret = -ret;
+    if (ret > MAX_ERROR)
+        ret = MAX_ERROR;
+
+    upcall_count.increment(ret);
+    return 0;
+}
+#endif /* For measuring upcall statistics/errors. */
 """
 
 
@@ -900,6 +925,95 @@  def print_timestamp(msg):
     print(time_string)
 
 
+#
+# Get errno short string
+#
+def get_errno_short(err):
+    try:
+        return errno.errorcode[err]
+    except KeyError:
+        return "_unknown_"
+
+
+#
+# Format a bfd per cpu array entry per cpu (if the count is > 0)
+#
+def format_per_cpu_array(cpu_array, key=None, skip_key=None):
+    per_cpu = ""
+
+    if key is not None:
+        total = cpu_array.sum(key).value
+        if total > 0:
+            for cpu, value in enumerate(cpu_array.getvalue(key)):
+                if value == 0:
+                    continue
+
+                per_cpu += " {}: {},".format(cpu, value)
+    else:
+        total = 0
+        total_cpu = None
+
+        for key in cpu_array.keys():
+            # I've seen extrem odd hight values, to avoid those, exclude any
+            # index higher thant 1024 for now.
+            if (not (0 <= key.value <= 1024)) or (skip_key is not None and
+                                                  skip_key.value == key.value):
+                continue
+
+            if total_cpu is None:
+                total_cpu = [0] * len(cpu_array.getvalue(key))
+
+            for cpu, value in enumerate(cpu_array.getvalue(key)):
+                total_cpu[cpu] += value
+                total += value
+
+        if total >= 0:
+            for cpu, value in enumerate(total_cpu):
+                if value == 0:
+                    continue
+
+                per_cpu += " {}: {},".format(cpu, value)
+
+    return str(total), per_cpu.strip(", ")
+
+
+#
+# Display kernel upcall statistics
+#
+def display_upcall_results():
+    upcalls = bpf.get_table("upcall_count")
+    have_upcalls = False
+
+    for k in upcalls.keys():
+        if not (0 <= k.value <= 255) or upcalls.sum(k).value == 0:
+            continue
+        have_upcalls = True
+        break
+
+    if not have_upcalls:
+        return
+
+    print("\n\n# UPCALL STATISTICS:\n  Total upcalls       : {} [{}]".format(
+        *format_per_cpu_array(upcalls)))
+
+    for k in upcalls.keys():
+        error = k.value
+        total, per_cpu = format_per_cpu_array(upcalls, key=k)
+
+        if error != 0 and total == "0":
+            continue
+
+        if error == 0:
+            print("  Successfull upcalls : {} [{}]".format(total, per_cpu))
+
+            total, per_cpu = format_per_cpu_array(upcalls, skip_key=k)
+            print("  Failed upcalls      : {} [{}]".format(total, per_cpu))
+        else:
+            print("    {:3}, {:13}: {} [{}]".format(error,
+                                                    get_errno_short(error),
+                                                    total, per_cpu))
+
+
 #
 # process_results()
 #
@@ -1074,7 +1188,12 @@  def process_results(syscall_events=None, trigger_delta=None):
                 indent, "TOTAL:", "", total_count, total_ns))
 
     #
-    # Print events
+    # Print upcall statistics
+    #
+    display_upcall_results()
+
+    #
+    # Print syscall events
     #
     lost_stack_traces = 0
     if syscall_events:
@@ -1194,6 +1313,9 @@  def main():
     parser.add_argument("--skip-syscall-poll-events",
                         help="Skip poll() syscalls with --syscall-events",
                         action="store_true")
+    parser.add_argument("--skip-upcall-stats",
+                        help="Skip the collection of upcall statistics",
+                        action="store_true")
     parser.add_argument("--stack-trace-size",
                         help="Number of unique stack traces that can be "
                         "recorded, default 4096. 0 to disable",
@@ -1298,6 +1420,9 @@  def main():
     source = source.replace("<STACK_TRACE_ENABLED>", "true"
                             if options.stack_trace_size > 0 else "false")
 
+    source = source.replace("<INSTALL_OVS_DP_UPCALL_PROBE>", "0"
+                            if options.skip_upcall_stats else "1")
+
     #
     # Handle start/stop probes
     #
diff --git a/utilities/usdt-scripts/kernel_delay.rst b/utilities/usdt-scripts/kernel_delay.rst
index 0f6f916a7..41620d760 100644
--- a/utilities/usdt-scripts/kernel_delay.rst
+++ b/utilities/usdt-scripts/kernel_delay.rst
@@ -109,6 +109,7 @@  followed by resource-specific data. Which are:
 - ``THREAD STOPPED STATISTICS``
 - ``HARD IRQ STATISTICS``
 - ``SOFT IRQ STATISTICS``
+- ``UPCALL STATISTICS``
 
 The following sections will describe in detail what statistics they report.
 
@@ -187,6 +188,14 @@  number of interrupts (``COUNT``), the total time spent in the interrupt
 handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
 
 
+``UPCALL STATISTICS``
+~~~~~~~~~~~~~~~~~~~~~
+The ``UPCALL STATISTICS`` section provides information on the number of
+upcalls sent by the kernel to userspace. If any upcalls fail to be sent,
+the specific return codes are recorded. Statistics are presented both as
+a total and on a per-CPU basis.
+
+
 The ``--syscall-events`` option
 -------------------------------
 In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``,