@@ -23,6 +23,8 @@ scripts_DATA += utilities/ovs-lib
usdt_SCRIPTS += \
utilities/usdt-scripts/bridge_loop.bt \
utilities/usdt-scripts/dpif_nl_exec_monitor.py \
+ utilities/usdt-scripts/kernel_delay.py \
+ utilities/usdt-scripts/kernel_delay.rst \
utilities/usdt-scripts/reval_monitor.py \
utilities/usdt-scripts/upcall_cost.py \
utilities/usdt-scripts/upcall_monitor.py
@@ -70,6 +72,8 @@ EXTRA_DIST += \
utilities/docker/debian/build-kernel-modules.sh \
utilities/usdt-scripts/bridge_loop.bt \
utilities/usdt-scripts/dpif_nl_exec_monitor.py \
+ utilities/usdt-scripts/kernel_delay.py \
+ utilities/usdt-scripts/kernel_delay.rst \
utilities/usdt-scripts/reval_monitor.py \
utilities/usdt-scripts/upcall_cost.py \
utilities/usdt-scripts/upcall_monitor.py
new file mode 100755
@@ -0,0 +1,1402 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2022,2023 Red Hat, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Script information:
+# -------------------
+# This script allows a developer to quickly identify if the issue at hand
+# might be related to the kernel running out of resources or if it really is
+# an Open vSwitch issue.
+#
+# For documentation see the kernel_delay.rst file.
+#
+import argparse
+import datetime
+import os
+import pytz
+import psutil
+import re
+import sys
+import time
+
+import ctypes as ct
+
+try:
+ from bcc import BPF, USDT, USDTException
+ from bcc.syscall import syscalls, syscall_name
+except ModuleNotFoundError:
+ print("ERROR: Can't find the BPF Compiler Collection (BCC) tools!")
+ sys.exit(os.EX_OSFILE)
+
+from enum import IntEnum
+
+
+#
+# Actual eBPF source code
+#
+EBPF_SOURCE = """
+#include <linux/irq.h>
+#include <linux/sched.h>
+
+#define MONITOR_PID <MONITOR_PID>
+
+enum {
+<EVENT_ENUM>
+};
+
+struct event_t {
+ u64 ts;
+ u32 tid;
+ u32 id;
+
+ int user_stack_id;
+ int kernel_stack_id;
+
+ u32 syscall;
+ u64 entry_ts;
+
+};
+
+BPF_RINGBUF_OUTPUT(events, <BUFFER_PAGE_CNT>);
+BPF_STACK_TRACE(stack_traces, <STACK_TRACE_SIZE>);
+BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1);
+BPF_TABLE("percpu_array", uint32_t, uint64_t, trigger_miss, 1);
+
+BPF_ARRAY(capture_on, u64, 1);
+static bool capture_enabled(u64 pid_tgid) {
+ int key = 0;
+ u64 *ret;
+
+ if ((pid_tgid >> 32) != MONITOR_PID)
+ return false;
+
+ ret = capture_on.lookup(&key);
+ return ret && *ret == 1;
+}
+
+static inline bool capture_enabled__() {
+ int key = 0;
+ u64 *ret;
+
+ ret = capture_on.lookup(&key);
+ return ret && *ret == 1;
+}
+
+
+static struct event_t *get_event(uint32_t id) {
+ struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t));
+
+ if (!event) {
+ dropcnt.increment(0);
+ return NULL;
+ }
+
+ event->id = id;
+ event->ts = bpf_ktime_get_ns();
+ event->tid = bpf_get_current_pid_tgid();
+
+ return event;
+}
+
+static int start_trigger() {
+ int key = 0;
+ u64 *val = capture_on.lookup(&key);
+
+ /* If the value is -1 we can't start as we are still processing the
+ * results in userspace. */
+ if (!val || *val != 0) {
+ trigger_miss.increment(0);
+ return 0;
+ }
+
+ struct event_t *event = get_event(EVENT_START_TRIGGER);
+ if (event) {
+ events.ringbuf_submit(event, 0);
+ *val = 1;
+ } else {
+ trigger_miss.increment(0);
+ }
+ return 0;
+}
+
+static int stop_trigger() {
+ int key = 0;
+ u64 *val = capture_on.lookup(&key);
+
+ if (!val || *val != 1)
+ return 0;
+
+ struct event_t *event = get_event(EVENT_STOP_TRIGGER);
+
+ if (event)
+ events.ringbuf_submit(event, 0);
+
+ if (val)
+ *val = -1;
+
+ return 0;
+}
+
+<START_TRIGGER>
+<STOP_TRIGGER>
+
+
+/*
+ * For the syscall monitor the following probes get installed.
+ */
+struct syscall_data_t {
+ u64 count;
+ u64 total_ns;
+ u64 worst_ns;
+};
+
+struct syscall_data_key_t {
+ u32 pid;
+ u32 tid;
+ u32 syscall;
+};
+
+BPF_HASH(syscall_start, u64, u64);
+BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t);
+
+TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
+ u64 pid_tgid = bpf_get_current_pid_tgid();
+
+ if (!capture_enabled(pid_tgid))
+ return 0;
+
+ u64 t = bpf_ktime_get_ns();
+ syscall_start.update(&pid_tgid, &t);
+
+ return 0;
+}
+
+TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
+ struct syscall_data_t *val, zero = {};
+ struct syscall_data_key_t key;
+
+ u64 pid_tgid = bpf_get_current_pid_tgid();
+
+ if (!capture_enabled(pid_tgid))
+ return 0;
+
+ key.pid = pid_tgid >> 32;
+ key.tid = (u32)pid_tgid;
+ key.syscall = args->id;
+
+ u64 *start_ns = syscall_start.lookup(&pid_tgid);
+
+ if (!start_ns)
+ return 0;
+
+ val = syscall_data.lookup_or_try_init(&key, &zero);
+ if (val) {
+ u64 delta = bpf_ktime_get_ns() - *start_ns;
+ val->count++;
+ val->total_ns += delta;
+ if (val->worst_ns == 0 || delta > val->worst_ns)
+ val->worst_ns = delta;
+
+ if (<SYSCAL_TRACE_EVENTS>) {
+ struct event_t *event = get_event(EVENT_SYSCALL);
+ if (event) {
+ event->syscall = args->id;
+ event->entry_ts = *start_ns;
+ if (<STACK_TRACE_ENABLED>) {
+ event->user_stack_id = stack_traces.get_stackid(
+ args, BPF_F_USER_STACK);
+ event->kernel_stack_id = stack_traces.get_stackid(
+ args, 0);
+ }
+ events.ringbuf_submit(event, 0);
+ }
+ }
+ }
+ return 0;
+}
+
+
+
+/*
+ * For measuring the thread run time, we need the following.
+ */
+struct run_time_data_t {
+ u64 count;
+ u64 total_ns;
+ u64 max_ns;
+ u64 min_ns;
+};
+
+struct pid_tid_key_t {
+ u32 pid;
+ u32 tid;
+};
+
+BPF_HASH(run_start, u64, u64);
+BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t);
+
+static inline void thread_start_run(u64 pid_tgid, u64 ktime)
+{
+ run_start.update(&pid_tgid, &ktime);
+}
+
+static inline void thread_stop_run(u32 pid, u32 tgid, u64 ktime)
+{
+ u64 pid_tgid = (u64) tgid << 32 | pid;
+ u64 *start_ns = run_start.lookup(&pid_tgid);
+
+ if (!start_ns || *start_ns == 0)
+ return;
+
+ struct run_time_data_t *val, zero = {};
+ struct pid_tid_key_t key = { .pid = tgid,
+ .tid = pid };
+
+ val = run_data.lookup_or_try_init(&key, &zero);
+ if (val) {
+ u64 delta = ktime - *start_ns;
+ val->count++;
+ val->total_ns += delta;
+ if (val->max_ns == 0 || delta > val->max_ns)
+ val->max_ns = delta;
+ if (val->min_ns == 0 || delta < val->min_ns)
+ val->min_ns = delta;
+ }
+ *start_ns = 0;
+}
+
+
+/*
+ * For measuring the thread-ready delay, we need the following.
+ */
+struct ready_data_t {
+ u64 count;
+ u64 total_ns;
+ u64 worst_ns;
+};
+
+BPF_HASH(ready_start, u64, u64);
+BPF_HASH(ready_data, struct pid_tid_key_t, struct ready_data_t);
+
+static inline int sched_wakeup__(u32 pid, u32 tgid)
+{
+ u64 pid_tgid = (u64) tgid << 32 | pid;
+
+ if (!capture_enabled(pid_tgid))
+ return 0;
+
+ u64 t = bpf_ktime_get_ns();
+ ready_start.update(&pid_tgid, &t);
+ return 0;
+}
+
+RAW_TRACEPOINT_PROBE(sched_wakeup)
+{
+ struct task_struct *t = (struct task_struct *)ctx->args[0];
+ return sched_wakeup__(t->pid, t->tgid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_wakeup_new)
+{
+ struct task_struct *t = (struct task_struct *)ctx->args[0];
+ return sched_wakeup__(t->pid, t->tgid);
+}
+
+RAW_TRACEPOINT_PROBE(sched_switch)
+{
+ struct task_struct *prev = (struct task_struct *)ctx->args[1];
+ struct task_struct *next= (struct task_struct *)ctx->args[2];
+ u64 ktime = 0;
+
+ if (!capture_enabled__())
+ return 0;
+
+ if (prev-><STATE_FIELD> == TASK_RUNNING && prev->tgid == MONITOR_PID)
+ sched_wakeup__(prev->pid, prev->tgid);
+
+ if (prev->tgid == MONITOR_PID) {
+ ktime = bpf_ktime_get_ns();
+ thread_stop_run(prev->pid, prev->tgid, ktime);
+ }
+
+ u64 pid_tgid = (u64)next->tgid << 32 | next->pid;
+
+ if (next->tgid != MONITOR_PID)
+ return 0;
+
+ if (ktime == 0)
+ ktime = bpf_ktime_get_ns();
+
+ u64 *start_ns = ready_start.lookup(&pid_tgid);
+
+ if (start_ns && *start_ns != 0) {
+
+ struct ready_data_t *val, zero = {};
+ struct pid_tid_key_t key = { .pid = next->tgid,
+ .tid = next->pid };
+
+ val = ready_data.lookup_or_try_init(&key, &zero);
+ if (val) {
+ u64 delta = ktime - *start_ns;
+ val->count++;
+ val->total_ns += delta;
+ if (val->worst_ns == 0 || delta > val->worst_ns)
+ val->worst_ns = delta;
+ }
+ *start_ns = 0;
+ }
+
+ thread_start_run(pid_tgid, ktime);
+ return 0;
+}
+
+/*
+ * For measuring the hard irq time, we need the following.
+ */
+struct hardirq_start_data_t {
+ u64 start_ns;
+ char irq_name[32];
+};
+
+struct hardirq_data_t {
+ u64 count;
+ u64 total_ns;
+ u64 worst_ns;
+};
+
+struct hardirq_data_key_t {
+ u32 pid;
+ u32 tid;
+ char irq_name[32];
+};
+
+BPF_HASH(hardirq_start, u64, struct hardirq_start_data_t);
+BPF_HASH(hardirq_data, struct hardirq_data_key_t, struct hardirq_data_t);
+
+TRACEPOINT_PROBE(irq, irq_handler_entry)
+{
+ u64 pid_tgid = bpf_get_current_pid_tgid();
+
+ if (!capture_enabled(pid_tgid))
+ return 0;
+
+ struct hardirq_start_data_t data = {};
+
+ data.start_ns = bpf_ktime_get_ns();
+ TP_DATA_LOC_READ_STR(&data.irq_name, name, sizeof(data.irq_name));
+ hardirq_start.update(&pid_tgid, &data);
+ return 0;
+}
+
+TRACEPOINT_PROBE(irq, irq_handler_exit)
+{
+ u64 pid_tgid = bpf_get_current_pid_tgid();
+
+ if (!capture_enabled(pid_tgid))
+ return 0;
+
+ struct hardirq_start_data_t *data;
+ data = hardirq_start.lookup(&pid_tgid);
+ if (!data || data->start_ns == 0)
+ return 0;
+
+ if (args->ret != IRQ_NONE) {
+ struct hardirq_data_t *val, zero = {};
+ struct hardirq_data_key_t key = { .pid = pid_tgid >> 32,
+ .tid = (u32)pid_tgid };
+
+ bpf_probe_read_kernel(&key.irq_name, sizeof(key.irq_name),
+ data->irq_name);
+ val = hardirq_data.lookup_or_try_init(&key, &zero);
+ if (val) {
+ u64 delta = bpf_ktime_get_ns() - data->start_ns;
+ val->count++;
+ val->total_ns += delta;
+ if (val->worst_ns == 0 || delta > val->worst_ns)
+ val->worst_ns = delta;
+ }
+ }
+
+ data->start_ns = 0;
+ return 0;
+}
+
+/*
+ * For measuring the soft irq time, we need the following.
+ */
+struct softirq_start_data_t {
+ u64 start_ns;
+ u32 vec_nr;
+};
+
+struct softirq_data_t {
+ u64 count;
+ u64 total_ns;
+ u64 worst_ns;
+};
+
+struct softirq_data_key_t {
+ u32 pid;
+ u32 tid;
+ u32 vec_nr;
+};
+
+BPF_HASH(softirq_start, u64, struct softirq_start_data_t);
+BPF_HASH(softirq_data, struct softirq_data_key_t, struct softirq_data_t);
+
+TRACEPOINT_PROBE(irq, softirq_entry)
+{
+ u64 pid_tgid = bpf_get_current_pid_tgid();
+
+ if (!capture_enabled(pid_tgid))
+ return 0;
+
+ struct softirq_start_data_t data = {};
+
+ data.start_ns = bpf_ktime_get_ns();
+ data.vec_nr = args->vec;
+ softirq_start.update(&pid_tgid, &data);
+ return 0;
+}
+
+TRACEPOINT_PROBE(irq, softirq_exit)
+{
+ u64 pid_tgid = bpf_get_current_pid_tgid();
+
+ if (!capture_enabled(pid_tgid))
+ return 0;
+
+ struct softirq_start_data_t *data;
+ data = softirq_start.lookup(&pid_tgid);
+ if (!data || data->start_ns == 0)
+ return 0;
+
+ struct softirq_data_t *val, zero = {};
+ struct softirq_data_key_t key = { .pid = pid_tgid >> 32,
+ .tid = (u32)pid_tgid,
+ .vec_nr = data->vec_nr};
+
+ val = softirq_data.lookup_or_try_init(&key, &zero);
+ if (val) {
+ u64 delta = bpf_ktime_get_ns() - data->start_ns;
+ val->count++;
+ val->total_ns += delta;
+ if (val->worst_ns == 0 || delta > val->worst_ns)
+ val->worst_ns = delta;
+ }
+
+ data->start_ns = 0;
+ return 0;
+}
+"""
+
+
+#
+# time_ns()
+#
+try:
+ from time import time_ns
+except ImportError:
+ # For compatibility with Python <= v3.6.
+ def time_ns():
+ now = datetime.datetime.now()
+ return int(now.timestamp() * 1e9)
+
+
+#
+# Probe class to use for the start/stop triggers
+#
+class Probe(object):
+ '''
+ The goal for this object is to support as many as possible
+ probe/events as supported by BCC. See
+ https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments
+ '''
+ def __init__(self, probe, pid=None):
+ self.pid = pid
+ self.text_probe = probe
+ self._parse_text_probe()
+
+ def __str__(self):
+ if self.probe_type == "usdt":
+ return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type,
+ self.usdt_provider, self.usdt_probe)
+ elif self.probe_type == "trace":
+ return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type,
+ self.trace_system, self.trace_event)
+ elif self.probe_type == "kprobe" or self.probe_type == "kretprobe":
+ return "[{}]; {}:{}".format(self.text_probe, self.probe_type,
+ self.kprobe_function)
+ elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
+ return "[{}]; {}:{}".format(self.text_probe, self.probe_type,
+ self.uprobe_function)
+ else:
+ return "[{}] <{}:unknown probe>".format(self.text_probe,
+ self.probe_type)
+
+ def _raise(self, error):
+ raise ValueError("[{}]; {}".format(self.text_probe, error))
+
+ def _verify_kprobe_probe(self):
+ # Nothing to verify for now, just return.
+ return
+
+ def _verify_trace_probe(self):
+ # Nothing to verify for now, just return.
+ return
+
+ def _verify_uprobe_probe(self):
+ # Nothing to verify for now, just return.
+ return
+
+ def _verify_usdt_probe(self):
+ if not self.pid:
+ self._raise("USDT probes need a valid PID.")
+
+ usdt = USDT(pid=self.pid)
+
+ for probe in usdt.enumerate_probes():
+ if probe.provider.decode('utf-8') == self.usdt_provider and \
+ probe.name.decode('utf-8') == self.usdt_probe:
+ return
+
+ self._raise("Can't find UDST probe '{}:{}'".format(self.usdt_provider,
+ self.usdt_probe))
+
+ def _parse_text_probe(self):
+ '''
+ The text probe format is defined as follows:
+ <probe_type>:<probe_specific>
+
+ Types:
+ USDT: u|usdt:<provider>:<probe>
+ TRACE: t|trace:<system>:<event>
+ KPROBE: k|kprobe:<kernel_function>
+ KRETPROBE: kr|kretprobe:<kernel_function>
+ UPROBE: up|uprobe:<function>
+ URETPROBE: ur|uretprobe:<function>
+ '''
+ args = self.text_probe.split(":")
+ if len(args) <= 1:
+ self._raise("Can't extract probe type.")
+
+ if args[0] not in ["k", "kprobe", "kr", "kretprobe", "t", "trace",
+ "u", "usdt", "up", "uprobe", "ur", "uretprobe"]:
+ self._raise("Invalid probe type '{}'".format(args[0]))
+
+ self.probe_type = "kprobe" if args[0] == "k" else args[0]
+ self.probe_type = "kretprobe" if args[0] == "kr" else self.probe_type
+ self.probe_type = "trace" if args[0] == "t" else self.probe_type
+ self.probe_type = "usdt" if args[0] == "u" else self.probe_type
+ self.probe_type = "uprobe" if args[0] == "up" else self.probe_type
+ self.probe_type = "uretprobe" if args[0] == "ur" else self.probe_type
+
+ if self.probe_type == "usdt":
+ if len(args) != 3:
+ self._raise("Invalid number of arguments for USDT")
+
+ self.usdt_provider = args[1]
+ self.usdt_probe = args[2]
+ self._verify_usdt_probe()
+
+ elif self.probe_type == "trace":
+ if len(args) != 3:
+ self._raise("Invalid number of arguments for TRACE")
+
+ self.trace_system = args[1]
+ self.trace_event = args[2]
+ self._verify_trace_probe()
+
+ elif self.probe_type == "kprobe" or self.probe_type == "kretprobe":
+ if len(args) != 2:
+ self._raise("Invalid number of arguments for K(RET)PROBE")
+ self.kprobe_function = args[1]
+ self._verify_kprobe_probe()
+
+ elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
+ if len(args) != 2:
+ self._raise("Invalid number of arguments for U(RET)PROBE")
+ self.uprobe_function = args[1]
+ self._verify_uprobe_probe()
+
+ def _get_kprobe_c_code(self, function_name, function_content):
+ #
+ # The kprobe__* do not require a function name, so it's
+ # ignored in the code generation.
+ #
+ return """
+int {}__{}(struct pt_regs *ctx) {{
+ {}
+}}
+""".format(self.probe_type, self.kprobe_function, function_content)
+
+ def _get_trace_c_code(self, function_name, function_content):
+ #
+ # The TRACEPOINT_PROBE() do not require a function name, so it's
+ # ignored in the code generation.
+ #
+ return """
+TRACEPOINT_PROBE({},{}) {{
+ {}
+}}
+""".format(self.trace_system, self.trace_event, function_content)
+
+ def _get_uprobe_c_code(self, function_name, function_content):
+ return """
+int {}(struct pt_regs *ctx) {{
+ {}
+}}
+""".format(function_name, function_content)
+
+ def _get_usdt_c_code(self, function_name, function_content):
+ return """
+int {}(struct pt_regs *ctx) {{
+ {}
+}}
+""".format(function_name, function_content)
+
+ def get_c_code(self, function_name, function_content):
+ if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe':
+ return self._get_kprobe_c_code(function_name, function_content)
+ elif self.probe_type == 'trace':
+ return self._get_trace_c_code(function_name, function_content)
+ elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe':
+ return self._get_uprobe_c_code(function_name, function_content)
+ elif self.probe_type == 'usdt':
+ return self._get_usdt_c_code(function_name, function_content)
+
+ return ""
+
+ def probe_name(self):
+ if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe':
+ return "{}".format(self.kprobe_function)
+ elif self.probe_type == 'trace':
+ return "{}:{}".format(self.trace_system,
+ self.trace_event)
+ elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe':
+ return "{}".format(self.uprobe_function)
+ elif self.probe_type == 'usdt':
+ return "{}:{}".format(self.usdt_provider,
+ self.usdt_probe)
+
+ return ""
+
+
+#
+# event_to_dict()
+#
+def event_to_dict(event):
+ return dict([(field, getattr(event, field))
+ for (field, _) in event._fields_
+ if isinstance(getattr(event, field), (int, bytes))])
+
+
+#
+# Event enum
+#
+Event = IntEnum("Event", ["SYSCALL", "START_TRIGGER", "STOP_TRIGGER",
+ ], start=0)
+
+
+#
+# process_event()
+#
+def process_event(ctx, data, size):
+ global start_trigger_ts
+ global stop_trigger_ts
+
+ event = bpf['events'].event(data)
+ if event.id == Event.SYSCALL:
+ syscall_events.append({"tid": event.tid,
+ "ts_entry": event.entry_ts,
+ "ts_exit": event.ts,
+ "syscall": event.syscall,
+ "user_stack_id": event.user_stack_id,
+ "kernel_stack_id": event.kernel_stack_id})
+ elif event.id == Event.START_TRIGGER:
+ #
+ # This event would have start the trigger already, so all we need to
+ # do is record the start timestamp.
+ #
+ start_trigger_ts = event.ts
+
+ elif event.id == Event.STOP_TRIGGER:
+ #
+ # This event would have stopped the trigger already, so all we need to
+ # do is record the start timestamp.
+ stop_trigger_ts = event.ts
+
+
+#
+# next_power_of_two()
+#
+def next_power_of_two(val):
+ np = 1
+ while np < val:
+ np *= 2
+ return np
+
+
+#
+# unsigned_int()
+#
+def unsigned_int(value):
+ try:
+ value = int(value)
+ except ValueError:
+ raise argparse.ArgumentTypeError("must be an integer")
+
+ if value < 0:
+ raise argparse.ArgumentTypeError("must be positive")
+ return value
+
+
+#
+# unsigned_nonzero_int()
+#
+def unsigned_nonzero_int(value):
+ value = unsigned_int(value)
+ if value == 0:
+ raise argparse.ArgumentTypeError("must be nonzero")
+ return value
+
+
+#
+# get_thread_name()
+#
+def get_thread_name(pid, tid):
+ try:
+ with open(f"/proc/{pid}/task/{tid}/comm", encoding="utf8") as f:
+ return f.readline().strip("\n")
+ except FileNotFoundError:
+ pass
+
+ return f"<unknown:{pid}/{tid}>"
+
+
+#
+# get_vec_nr_name()
+#
+def get_vec_nr_name(vec_nr):
+ known_vec_nr = ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
+ "tasklet", "sched", "hrtimer", "rcu"]
+
+ if vec_nr < 0 or vec_nr > len(known_vec_nr):
+ return f"<unknown:{vec_nr}>"
+
+ return known_vec_nr[vec_nr]
+
+
+#
+# start/stop/reset capture
+#
+def start_capture():
+ bpf["capture_on"][ct.c_int(0)] = ct.c_int(1)
+
+
+def stop_capture(force=False):
+ if force:
+ bpf["capture_on"][ct.c_int(0)] = ct.c_int(0xffff)
+ else:
+ bpf["capture_on"][ct.c_int(0)] = ct.c_int(0)
+
+
+def capture_running():
+ return True if bpf["capture_on"][ct.c_int(0)].value == 1 else False
+
+
+def reset_capture():
+ bpf["syscall_start"].clear()
+ bpf["syscall_data"].clear()
+ bpf["run_start"].clear()
+ bpf["run_data"].clear()
+ bpf["ready_start"].clear()
+ bpf["ready_data"].clear()
+ bpf["hardirq_start"].clear()
+ bpf["hardirq_data"].clear()
+ bpf["softirq_start"].clear()
+ bpf["softirq_data"].clear()
+ bpf["stack_traces"].clear()
+
+
+#
+# Display timestamp
+#
+def print_timestamp(msg):
+ ltz = datetime.datetime.now()
+ utc = ltz.astimezone(pytz.utc)
+ time_string = "{} @{} ({} UTC)".format(
+ msg, ltz.isoformat(), utc.strftime("%H:%M:%S"))
+ print(time_string)
+
+
+#
+# process_results()
+#
+def process_results(syscal_events=None, trigger_delta=None):
+ if trigger_delta:
+ print_timestamp("# Triggered sample dump, stop-start delta {:,} ns".
+ format(trigger_delta))
+ else:
+ print_timestamp("# Sample dump")
+
+ #
+ # First get a list of all threads we need to report on.
+ #
+ threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items()
+ if k.syscall != 0xffffffff}
+
+ threads_run = {k.tid for k, _ in bpf["run_data"].items()
+ if k.pid != 0xffffffff}
+
+ threads_ready = {k.tid for k, _ in bpf["ready_data"].items()
+ if k.pid != 0xffffffff}
+
+ threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items()
+ if k.pid != 0xffffffff}
+
+ threads_softirq = {k.tid for k, _ in bpf["softirq_data"].items()
+ if k.pid != 0xffffffff}
+
+ threads = sorted(threads_syscall | threads_run | threads_ready |
+ threads_hardirq | threads_softirq,
+ key=lambda x: get_thread_name(options.pid, x))
+
+ #
+ # Print header...
+ #
+ print("{:10} {:16} {}".format("TID", "THREAD", "<RESOURCE SPECIFIC>"))
+ print("{:10} {:16} {}".format("-" * 10, "-" * 16, "-" * 76))
+ indent = 28 * " "
+
+ #
+ # Print all events/statistics per threads.
+ #
+ poll_id = [k for k, v in syscalls.items() if v == b'poll'][0]
+ for thread in threads:
+
+ if thread != threads[0]:
+ print("")
+
+ #
+ # SYSCALL_STATISTICS
+ #
+ print("{:10} {:16} {}\n{}{:20} {:>6} {:>10} {:>16} {:>16}".format(
+ thread, get_thread_name(options.pid, thread),
+ "[SYSCALL STATISTICS]", indent,
+ "NAME", "NUMBER", "COUNT", "TOTAL ns", "MAX ns"))
+
+ total_count = 0
+ total_ns = 0
+ for k, v in sorted(bpf["syscall_data"].items(),
+ key=lambda kv: -kv[1].total_ns):
+ if k.tid != thread:
+ continue
+
+ print("{}{:20.20} {:6} {:10} {:16,} {:16,}".format(
+ indent, syscall_name(k.syscall).decode('utf-8'), k.syscall,
+ v.count, v.total_ns, v.worst_ns))
+ if k.syscall != poll_id:
+ total_count += v.count
+ total_ns += v.total_ns
+
+ if total_count > 0:
+ print("{}{:20.20} {:6} {:10} {:16,}".format(
+ indent, "TOTAL( - poll):", "", total_count, total_ns))
+
+ #
+ # THREAD RUN STATISTICS
+ #
+ print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format(
+ "", "", "[THREAD RUN STATISTICS]", indent,
+ "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns"))
+
+ for k, v in bpf["run_data"].items():
+ if k.tid != thread:
+ continue
+
+ print("{}{:10} {:16,} {:16,} {:16,}".format(
+ indent, v.count, v.total_ns, v.min_ns, v.max_ns))
+
+ #
+ # THREAD READY STATISTICS
+ #
+ print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format(
+ "", "", "[THREAD READY STATISTICS]", indent,
+ "SCHED_CNT", "TOTAL ns", "MAX ns"))
+
+ for k, v in bpf["ready_data"].items():
+ if k.tid != thread:
+ continue
+
+ print("{}{:10} {:16,} {:16,}".format(
+ indent, v.count, v.total_ns, v.worst_ns))
+
+ #
+ # HARD IRQ STATISTICS
+ #
+ total_ns = 0
+ total_count = 0
+ header_printed = False
+ for k, v in sorted(bpf["hardirq_data"].items(),
+ key=lambda kv: -kv[1].total_ns):
+ if k.tid != thread:
+ continue
+
+ if not header_printed:
+ print("\n{:10} {:16} {}\n{}{:20} {:>10} {:>16} {:>16}".
+ format("", "", "[HARD IRQ STATISTICS]", indent,
+ "NAME", "COUNT", "TOTAL ns", "MAX ns"))
+ header_printed = True
+
+ print("{}{:20.20} {:10} {:16,} {:16,}".format(
+ indent, k.irq_name.decode('utf-8'),
+ v.count, v.total_ns, v.worst_ns))
+
+ total_count += v.count
+ total_ns += v.total_ns
+
+ if total_count > 0:
+ print("{}{:20.20} {:10} {:16,}".format(
+ indent, "TOTAL:", total_count, total_ns))
+
+ #
+ # SOFT IRQ STATISTICS
+ #
+ total_ns = 0
+ total_count = 0
+ header_printed = False
+ for k, v in sorted(bpf["softirq_data"].items(),
+ key=lambda kv: -kv[1].total_ns):
+ if k.tid != thread:
+ continue
+
+ if not header_printed:
+ print("\n{:10} {:16} {}\n"
+ "{}{:20} {:>7} {:>10} {:>16} {:>16}".
+ format("", "", "[SOFT IRQ STATISTICS]", indent,
+ "NAME", "VECT_NR", "COUNT", "TOTAL ns", "MAX ns"))
+ header_printed = True
+
+ print("{}{:20.20} {:>7} {:10} {:16,} {:16,}".format(
+ indent, get_vec_nr_name(k.vec_nr), k.vec_nr,
+ v.count, v.total_ns, v.worst_ns))
+
+ total_count += v.count
+ total_ns += v.total_ns
+
+ if total_count > 0:
+ print("{}{:20.20} {:7} {:10} {:16,}".format(
+ indent, "TOTAL:", "", total_count, total_ns))
+
+ #
+ # Print events
+ #
+ lost_stack_traces = 0
+ if syscall_events is not None and len(syscall_events) > 0:
+ stack_traces = bpf.get_table("stack_traces")
+
+ print("\n\n# SYSCALL EVENTS:"
+ "\n{}{:>19} {:>19} {:>10} {:16} {:>10} {}".format(
+ 2 * " ", "ENTRY (ns)", "EXIT (ns)", "TID", "COMM",
+ "DELTA (us)", "SYSCALL"))
+ print("{}{:19} {:19} {:10} {:16} {:10} {}".format(
+ 2 * " ", "-" * 19, "-" * 19, "-" * 10, "-" * 16,
+ "-" * 10, "-" * 16))
+ for event in syscall_events:
+ print("{}{:19} {:19} {:10} {:16} {:10,} {}".format(
+ " " * 2,
+ event["ts_entry"], event["ts_exit"], event["tid"],
+ get_thread_name(options.pid, event["tid"]),
+ int((event["ts_exit"] - event["ts_entry"]) / 1000),
+ syscall_name(event["syscall"]).decode('utf-8')))
+ #
+ # Not sure where to put this, but I'll add some info on stack
+ # traces here... Userspace stack traces are very limited due to
+ # the fact that bcc does not support dwarf backtraces. As OVS
+ # gets compiled without frame pointers we will not see much.
+ # If however, OVS does get built with frame pointers, we should not
+ # use the BPF_STACK_TRACE_BUILDID as it does not seem to handle
+ # the debug symbols correctly. Also, note that for kernel
+ # traces you should not use BPF_STACK_TRACE_BUILDID, so two
+ # buffers are needed.
+ #
+ # Some info on manual dwarf walk support:
+ # https://github.com/iovisor/bcc/issues/3515
+ # https://github.com/iovisor/bcc/pull/4463
+ #
+ if options.stack_trace_size == 0:
+ continue
+
+ if event['kernel_stack_id'] < 0 or event['user_stack_id'] < 0:
+ lost_stack_traces += 1
+
+ kernel_stack = stack_traces.walk(event['kernel_stack_id']) \
+ if event['kernel_stack_id'] >= 0 else []
+ user_stack = stack_traces.walk(event['user_stack_id']) \
+ if event['user_stack_id'] >= 0 else []
+
+ for addr in kernel_stack:
+ print("{}{}".format(
+ " " * 10,
+ bpf.ksym(addr, show_module=True,
+ show_offset=True).decode('utf-8', 'replace')))
+
+ for addr in user_stack:
+ addr_str = bpf.sym(addr, options.pid, show_module=True,
+ show_offset=True).decode('utf-8', 'replace')
+
+ if addr_str == "[unknown]":
+ addr_str += " 0x{:x}".format(addr)
+
+ print("{}{}".format(" " * 10, addr_str))
+
+ #
+ # Print any footer messages.
+ #
+ if lost_stack_traces > 0:
+ print("\n#WARNING: We where not able to display {} stack traces!\n"
+ "# Consider increasing the stack trace size using\n"
+ "# the '--stack-trace-size' option.\n"
+ "# Note that this can also happen due to a stack id\n"
+ "# collision.".format(lost_stack_traces))
+
+
+#
+# main()
+#
+def main():
+ #
+ # Don't like these globals, but ctx passing does not seem to work with the
+ # existing open_ring_buffer() API :(
+ #
+ global bpf
+ global options
+ global syscall_events
+ global start_trigger_ts
+ global stop_trigger_ts
+
+ start_trigger_ts = 0
+ stop_trigger_ts = 0
+
+ #
+ # Argument parsing
+ #
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument("-D", "--debug",
+ help="Enable eBPF debugging",
+ type=int, const=0x3f, default=0, nargs='?')
+ parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID",
+ help="ovs-vswitch's PID",
+ type=unsigned_int, default=None)
+ parser.add_argument("-s", "--syscall-events", metavar="DURATION_NS",
+ help="Record syscall events, default disabled",
+ type=unsigned_int, const=0, default=None, nargs='?')
+ parser.add_argument("--buffer-page-count",
+ help="Number of BPF ring buffer pages, default 1024",
+ type=unsigned_int, default=1024, metavar="NUMBER")
+ parser.add_argument("--sample-count",
+ help="Number of sample runs, default 1",
+ type=unsigned_nonzero_int, default=1, metavar="RUNS")
+ parser.add_argument("--sample-interval",
+ help="Delay between sample runs, default 0",
+ type=float, default=0, metavar="SECONDS")
+ parser.add_argument("--sample-time",
+ help="Sample time, default 0.5 seconds",
+ type=float, default=0.5, metavar="SECONDS")
+ parser.add_argument("--skip-syscall-poll-events",
+ help="Skip poll() syscalls with --syscall-events",
+ action="store_true")
+ parser.add_argument("--stack-trace-size",
+ help="Number of unique stack traces that can be "
+ "recorded, default 4096. 0 to disable",
+ type=unsigned_int, default=4096)
+ parser.add_argument("--start-trigger", metavar="TRIGGER",
+ help="Start trigger, see documentation for details",
+ type=str, default=None)
+ parser.add_argument("--stop-trigger", metavar="TRIGGER",
+ help="Stop trigger, see documentation for details",
+ type=str, default=None)
+ parser.add_argument("--trigger-delta", metavar="DURATION_NS",
+ help="Only report event when the trigger duration > "
+ "DURATION_NS, default 0 (all events)",
+ type=unsigned_int, const=0, default=0, nargs='?')
+
+ options = parser.parse_args()
+
+ #
+ # Find the PID of the ovs-vswitchd daemon if not specified.
+ #
+ if options.pid is None:
+ for proc in psutil.process_iter():
+ if 'ovs-vswitchd' in proc.name():
+ if options.pid is not None:
+ print("ERROR: Multiple ovs-vswitchd daemons running, "
+ "use the -p option!")
+ sys.exit(os.EX_NOINPUT)
+
+ options.pid = proc.pid
+
+ #
+ # Error checking on input parameters.
+ #
+ if options.pid is None:
+ print("ERROR: Failed to find ovs-vswitchd's PID!")
+ sys.exit(os.EX_UNAVAILABLE)
+
+ options.buffer_page_count = next_power_of_two(options.buffer_page_count)
+
+ #
+ # Make sure we are running as root, or else we can not attach the probes.
+ #
+ if os.geteuid() != 0:
+ print("ERROR: We need to run as root to attached probes!")
+ sys.exit(os.EX_NOPERM)
+
+ #
+ # Setup any of the start stop triggers
+ #
+ if options.start_trigger is not None:
+ try:
+ start_trigger = Probe(options.start_trigger, pid=options.pid)
+ except ValueError as e:
+ print(f"ERROR: Invalid start trigger {str(e)}")
+ sys.exit(os.EX_CONFIG)
+ else:
+ start_trigger = None
+
+ if options.stop_trigger is not None:
+ try:
+ stop_trigger = Probe(options.stop_trigger, pid=options.pid)
+ except ValueError as e:
+ print(f"ERROR: Invalid stop trigger {str(e)}")
+ sys.exit(os.EX_CONFIG)
+ else:
+ stop_trigger = None
+
+ #
+ # Attach probe to running process.
+ #
+ source = EBPF_SOURCE.replace("<EVENT_ENUM>", "\n".join(
+ [" EVENT_{} = {},".format(
+ event.name, event.value) for event in Event]))
+ source = source.replace("<BUFFER_PAGE_CNT>",
+ str(options.buffer_page_count))
+ source = source.replace("<MONITOR_PID>", str(options.pid))
+
+ if BPF.kernel_struct_has_field(b'task_struct', b'state') == 1:
+ source = source.replace('<STATE_FIELD>', 'state')
+ else:
+ source = source.replace('<STATE_FIELD>', '__state')
+
+ poll_id = [k for k, v in syscalls.items() if v == b'poll'][0]
+ if options.syscall_events is None:
+ syscall_trace_events = "false"
+ elif options.syscall_events == 0:
+ if not options.skip_syscall_poll_events:
+ syscall_trace_events = "true"
+ else:
+ syscall_trace_events = f"args->id != {poll_id}"
+ else:
+ syscall_trace_events = "delta > {}".format(options.syscall_events)
+ if options.skip_syscall_poll_events:
+ syscall_trace_events += f" && args->id != {poll_id}"
+
+ source = source.replace("<SYSCAL_TRACE_EVENTS>",
+ syscall_trace_events)
+
+ source = source.replace("<STACK_TRACE_SIZE>",
+ str(options.stack_trace_size))
+
+ source = source.replace("<STACK_TRACE_ENABLED>", "true"
+ if options.stack_trace_size > 0 else "false")
+
+ #
+ # Handle start/stop probes
+ #
+ if start_trigger:
+ source = source.replace("<START_TRIGGER>",
+ start_trigger.get_c_code(
+ "start_trigger_probe",
+ "return start_trigger();"))
+ else:
+ source = source.replace("<START_TRIGGER>", "")
+
+ if stop_trigger:
+ source = source.replace("<STOP_TRIGGER>",
+ stop_trigger.get_c_code(
+ "stop_trigger_probe",
+ "return stop_trigger();"))
+ else:
+ source = source.replace("<STOP_TRIGGER>", "")
+
+ #
+ # Setup usdt or other probes that need handling trough the BFP class.
+ #
+ usdt = USDT(pid=int(options.pid))
+ try:
+ if start_trigger and start_trigger.probe_type == 'usdt':
+ usdt.enable_probe(probe=start_trigger.probe_name(),
+ fn_name="start_trigger_probe")
+ if stop_trigger and stop_trigger.probe_type == 'usdt':
+ usdt.enable_probe(probe=stop_trigger.probe_name(),
+ fn_name="stop_trigger_probe")
+
+ except USDTException as e:
+ print("ERROR: {}".format(
+ (re.sub('^', ' ' * 7, str(e), flags=re.MULTILINE)).strip().
+ replace("--with-dtrace or --enable-dtrace",
+ "--enable-usdt-probes")))
+ sys.exit(os.EX_OSERR)
+
+ bpf = BPF(text=source, usdt_contexts=[usdt], debug=options.debug)
+
+ if start_trigger:
+ try:
+ if start_trigger.probe_type == "uprobe":
+ bpf.attach_uprobe(name=f"/proc/{options.pid}/exe",
+ sym=start_trigger.probe_name(),
+ fn_name="start_trigger_probe",
+ pid=options.pid)
+
+ if start_trigger.probe_type == "uretprobe":
+ bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe",
+ sym=start_trigger.probe_name(),
+ fn_name="start_trigger_probe",
+ pid=options.pid)
+ except Exception as e:
+ print("ERROR: Failed attaching uprobe start trigger "
+ f"'{start_trigger.probe_name()}';\n {str(e)}")
+ sys.exit(os.EX_OSERR)
+
+ if stop_trigger:
+ try:
+ if stop_trigger.probe_type == "uprobe":
+ bpf.attach_uprobe(name=f"/proc/{options.pid}/exe",
+ sym=stop_trigger.probe_name(),
+ fn_name="stop_trigger_probe",
+ pid=options.pid)
+
+ if stop_trigger.probe_type == "uretprobe":
+ bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe",
+ sym=stop_trigger.probe_name(),
+ fn_name="stop_trigger_probe",
+ pid=options.pid)
+ except Exception as e:
+ print("ERROR: Failed attaching uprobe stop trigger"
+ f"'{stop_trigger.probe_name()}';\n {str(e)}")
+ sys.exit(os.EX_OSERR)
+
+ #
+ # If no triggers are configured use the delay configuration
+ #
+ bpf['events'].open_ring_buffer(process_event)
+
+ sample_count = 0
+ while sample_count < options.sample_count:
+ sample_count += 1
+ syscall_events = []
+
+ if options.start_trigger is None:
+ print_timestamp("# Start sampling")
+ start_capture()
+ stop_time = -1 if options.stop_trigger else \
+ time_ns() + options.sample_time * 1000000000
+ else:
+ # For start triggers the stop time depends on the start trigger
+ # time, or depends on the stop trigger if configured.
+ stop_time = -1 if options.stop_trigger else 0
+
+ while True:
+ try:
+ last_start_ts = start_trigger_ts
+ last_stop_ts = stop_trigger_ts
+
+ if stop_time > 0:
+ delay = int((stop_time - time_ns()) / 1000000)
+ if delay <= 0:
+ break
+ else:
+ delay = -1
+
+ bpf.ring_buffer_poll(timeout=delay)
+
+ if stop_time <= 0 and last_start_ts != start_trigger_ts:
+ print_timestamp(
+ "# Start sampling (trigger@{})".format(
+ start_trigger_ts))
+
+ if not options.stop_trigger:
+ stop_time = time_ns() + \
+ options.sample_time * 1000000000
+
+ if last_stop_ts != stop_trigger_ts:
+ break
+
+ except KeyboardInterrupt:
+ break
+
+ if options.stop_trigger and not capture_running():
+ print_timestamp("# Stop sampling (trigger@{})".format(
+ stop_trigger_ts))
+ else:
+ print_timestamp("# Stop sampling")
+
+ if stop_trigger_ts != 0 and start_trigger_ts != 0:
+ trigger_delta = stop_trigger_ts - start_trigger_ts
+ else:
+ trigger_delta = None
+
+ if not trigger_delta or trigger_delta >= options.trigger_delta:
+ stop_capture(force=True) # Prevent a new trigger to start.
+ process_results(syscal_events=syscall_events,
+ trigger_delta=trigger_delta)
+ elif trigger_delta:
+ sample_count -= 1
+ print_timestamp("# Sample dump skipped, delta {:,} ns".format(
+ trigger_delta))
+
+ reset_capture()
+ stop_capture()
+ if options.sample_interval > 0:
+ time.sleep(options.sample_interval)
+
+ #
+ # Report lost events.
+ #
+ dropcnt = bpf.get_table("dropcnt")
+ for k in dropcnt.keys():
+ count = dropcnt.sum(k).value
+ if k.value == 0 and count > 0:
+ print("\n# WARNING: Not all events were captured, {} were "
+ "dropped!\n# Increase the BPF ring buffer size "
+ "with the --buffer-page-count option.".format(count))
+
+ if (options.sample_count > 1):
+ trigger_miss = bpf.get_table("trigger_miss")
+ for k in trigger_miss.keys():
+ count = trigger_miss.sum(k).value
+ if k.value == 0 and count > 0:
+ print("\n# WARNING: Not all start triggers were successful. "
+ "{} were missed due to\n# slow userspace "
+ "processing!".format(count))
+
+
+#
+# Start main() as the default entry point...
+#
+if __name__ == '__main__':
+ main()
new file mode 100644
@@ -0,0 +1,596 @@
+Troubleshooting Open vSwitch: Is the kernel to blame?
+=====================================================
+Often, when troubleshooting Open vSwitch (OVS) in the field, you might be left
+wondering if the issue is really OVS-related, or if it's a problem with the
+kernel being overloaded. Messages in the log like
+``Unreasonably long XXXXms poll interval`` might suggest it's OVS, but from
+experience, these are mostly related to an overloaded Linux Kernel.
+The kernel_delay.py tool can help you quickly identify if the focus of your
+investigation should be OVS or the Linux kernel.
+
+
+Introduction
+------------
+``kernel_delay.py`` consists of a Python script that uses the BCC [#BCC]_
+framework to install eBPF probes. The data the eBPF probes collect will be
+analyzed and presented to the user by the Python script. Some of the presented
+data can also be captured by the individual scripts included in the BBC [#BCC]_
+framework.
+
+kernel_delay.py has two modes of operation:
+
+- In **time mode**, the tool runs for a specific time and collects the
+ information.
+- In **trigger mode**, event collection can be started and/or stopped based on
+ a specific eBPF probe. Currently, the following probes are supported:
+ - USDT probes
+ - Kernel tracepoints
+ - kprobe
+ - kretprobe
+ - uprobe
+ - uretprobe
+
+
+In addition, the option, ``--sample-count``, exists to specify how many
+iterations you would like to do. When using triggers, you can also ignore
+samples if they are less than a number of nanoseconds with the
+``--trigger-delta`` option. The latter might be useful when debugging Linux
+syscalls which take a long time to complete. More on this later. Finally, you
+can configure the delay between two sample runs with the ``--sample-interval``
+option.
+
+Before getting into more details, you can run the tool without any options
+to see what the output looks like. Notice that it will try to automatically
+get the process ID of the running ``ovs-vsdwitchd``. You can overwrite this
+with the ``--pid`` option.
+
+.. code-block:: console
+
+ $ sudo ./kernel_delay.py
+ # Start sampling @2023-06-08T12:17:22.725127 (10:17:22 UTC)
+ # Stop sampling @2023-06-08T12:17:23.224781 (10:17:23 UTC)
+ # Sample dump @2023-06-08T12:17:23.224855 (10:17:23 UTC)
+ TID THREAD <RESOURCE SPECIFIC>
+ ---------- ---------------- ----------------------------------------------------------------------------
+ 27090 ovs-vswitchd [SYSCALL STATISTICS]
+ <EDIT: REMOVED DATA FOR ovs-vswitchd THREAD>
+
+ 31741 revalidator122 [SYSCALL STATISTICS]
+ NAME NUMBER COUNT TOTAL ns MAX ns
+ poll 7 5 184,193,176 184,191,520
+ recvmsg 47 494 125,208,756 310,331
+ futex 202 8 18,768,758 4,023,039
+ sendto 44 10 375,861 266,867
+ sendmsg 46 4 43,294 11,213
+ write 1 1 5,949 5,949
+ getrusage 98 1 1,424 1,424
+ read 0 1 1,292 1,292
+ TOTAL( - poll): 519 144,405,334
+
+ [THREAD RUN STATISTICS]
+ SCHED_CNT TOTAL ns MIN ns MAX ns
+ 6 136,764,071 1,480 115,146,424
+
+ [THREAD READY STATISTICS]
+ SCHED_CNT TOTAL ns MAX ns
+ 7 11,334 6,636
+
+ [HARD IRQ STATISTICS]
+ NAME COUNT TOTAL ns MAX ns
+ eno8303-rx-1 1 3,586 3,586
+ TOTAL: 1 3,586
+
+ [SOFT IRQ STATISTICS]
+ NAME VECT_NR COUNT TOTAL ns MAX ns
+ net_rx 3 1 17,699 17,699
+ sched 7 6 13,820 3,226
+ rcu 9 16 13,586 1,554
+ timer 1 3 10,259 3,815
+ TOTAL: 26 55,364
+
+
+By default, the tool will run for half a second in `time mode`. To extend this
+you can use the ``--sample-time`` option.
+
+
+What will it report
+-------------------
+The above sample output separates the captured data on a per-thread basis.
+For this, it displays the thread's id (``TID``) and name (``THREAD``),
+followed by resource-specific data. Which are:
+
+- ``SYSCALL STATISTICS``
+- ``THREAD RUN STATISTICS``
+- ``THREAD READY STATISTICS``
+- ``HARD IRQ STATISTICS``
+- ``SOFT IRQ STATISTICS``
+
+The following sections will describe in detail what statistics they report.
+
+
+``SYSCALL STATISTICS``
+~~~~~~~~~~~~~~~~~~~~~~
+``SYSCALL STATISTICS`` tell you which Linux system calls got executed during
+the measurement interval. This includes the number of times the syscall was
+called (``COUNT``), the total time spent in the system calls (``TOTAL ns``),
+and the worst-case duration of a single call (``MAX ns``).
+
+It also shows the total of all system calls, but it excludes the poll system
+call, as the purpose of this call is to wait for activity on a set of sockets,
+and usually, the thread gets swapped out.
+
+Note that it only counts calls that started and stopped during the
+measurement interval!
+
+
+``THREAD RUN STATISTICS``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU
+during the measurement interval.
+
+Note that these statistics only count events where the thread started and
+stopped running on a CPU during the measurement interval. For example, if
+this was a PMD thread, you should see zero ``SCHED_CNT`` and ``TOTAL_ns``.
+If not, there might be a misconfiguration.
+
+
+``THREAD READY STATISTICS``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+``THREAD READY STATISTICS`` tell you the time between the thread being ready
+to run and it actually running on the CPU.
+
+Note that these statistics only count events where the thread was getting
+ready to run and started running during the measurement interval.
+
+
+``HARD IRQ STATISTICS``
+~~~~~~~~~~~~~~~~~~~~~~~
+``HARD IRQ STATISTICS`` tell you how much time was spent servicing hard
+interrupts during the threads run time.
+
+It shows the interrupt name (``NAME``), the number of interrupts (``COUNT``),
+the total time spent in the interrupt handler (``TOTAL ns``), and the
+worst-case duration (``MAX ns``).
+
+
+``SOFT IRQ STATISTICS``
+~~~~~~~~~~~~~~~~~~~~~~~
+``SOFT IRQ STATISTICS`` tell you how much time was spent servicing soft
+interrupts during the threads run time.
+
+It shows the interrupt name (``NAME``), vector number (``VECT_NR``), the
+number of interrupts (``COUNT``), the total time spent in the interrupt
+handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
+
+
+The ``--syscall-events`` option
+-------------------------------
+In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``,
+the tool can also report each individual syscall. This can be a usefull
+second step if the ``SYSCALL_STATISTICS`` show high latency numbers.
+
+All you need to do is add the ``--syscall-events`` option, with or without
+the additional ``DURATION_NS`` parameter. The ``DUTATION_NS`` parameter
+allows you to exclude events that take less than the supplied time.
+
+The ``--skip-syscall-poll-events`` option allows you to exclude poll
+syscalls from the report.
+
+Below is an example run, note that the resource-specific data is removed
+to highlight the syscall events:
+
+.. code-block:: console
+
+ $ sudo ./kernel_delay.py --syscall-events 50000 --skip-syscall-poll-events
+ # Start sampling @2023-06-13T17:10:46.460874 (15:10:46 UTC)
+ # Stop sampling @2023-06-13T17:10:46.960727 (15:10:46 UTC)
+ # Sample dump @2023-06-13T17:10:46.961033 (15:10:46 UTC)
+ TID THREAD <RESOURCE SPECIFIC>
+ ---------- ---------------- ----------------------------------------------------------------------------
+ 3359686 ipf_clean2 [SYSCALL STATISTICS]
+ ...
+ 3359635 ovs-vswitchd [SYSCALL STATISTICS]
+ ...
+ 3359697 revalidator12 [SYSCALL STATISTICS]
+ ...
+ 3359698 revalidator13 [SYSCALL STATISTICS]
+ ...
+ 3359699 revalidator14 [SYSCALL STATISTICS]
+ ...
+ 3359700 revalidator15 [SYSCALL STATISTICS]
+ ...
+
+ # SYSCALL EVENTS:
+ ENTRY (ns) EXIT (ns) TID COMM DELTA (us) SYSCALL
+ ------------------- ------------------- ---------- ---------------- ---------- ----------------
+ 2161821694935486 2161821695031201 3359699 revalidator14 95 futex
+ syscall_exit_to_user_mode_prepare+0x161 [kernel]
+ syscall_exit_to_user_mode_prepare+0x161 [kernel]
+ syscall_exit_to_user_mode+0x9 [kernel]
+ do_syscall_64+0x68 [kernel]
+ entry_SYSCALL_64_after_hwframe+0x72 [kernel]
+ __GI___lll_lock_wait+0x30 [libc.so.6]
+ ovs_mutex_lock_at+0x18 [ovs-vswitchd]
+ [unknown] 0x696c003936313a63
+ 2161821695276882 2161821695333687 3359698 revalidator13 56 futex
+ syscall_exit_to_user_mode_prepare+0x161 [kernel]
+ syscall_exit_to_user_mode_prepare+0x161 [kernel]
+ syscall_exit_to_user_mode+0x9 [kernel]
+ do_syscall_64+0x68 [kernel]
+ entry_SYSCALL_64_after_hwframe+0x72 [kernel]
+ __GI___lll_lock_wait+0x30 [libc.so.6]
+ ovs_mutex_lock_at+0x18 [ovs-vswitchd]
+ [unknown] 0x696c003134313a63
+ 2161821695275820 2161821695405733 3359700 revalidator15 129 futex
+ syscall_exit_to_user_mode_prepare+0x161 [kernel]
+ syscall_exit_to_user_mode_prepare+0x161 [kernel]
+ syscall_exit_to_user_mode+0x9 [kernel]
+ do_syscall_64+0x68 [kernel]
+ entry_SYSCALL_64_after_hwframe+0x72 [kernel]
+ __GI___lll_lock_wait+0x30 [libc.so.6]
+ ovs_mutex_lock_at+0x18 [ovs-vswitchd]
+ [unknown] 0x696c003936313a63
+ 2161821695964969 2161821696052021 3359635 ovs-vswitchd 87 accept
+ syscall_exit_to_user_mode_prepare+0x161 [kernel]
+ syscall_exit_to_user_mode_prepare+0x161 [kernel]
+ syscall_exit_to_user_mode+0x9 [kernel]
+ do_syscall_64+0x68 [kernel]
+ entry_SYSCALL_64_after_hwframe+0x72 [kernel]
+ __GI_accept+0x4d [libc.so.6]
+ pfd_accept+0x3a [ovs-vswitchd]
+ [unknown] 0x7fff19f2bd00
+ [unknown] 0xe4b8001f0f
+
+As you can see above, the output also shows the stackback trace. You can
+disable this using the ``--stack-trace-size 0`` option.
+
+As you can see above, the backtrace does not show a lot of useful information
+due to the BCC [#BCC]_ toolkit not supporting dwarf decoding. So to further
+analyze system call backtraces, you could use perf. The following perf
+script can do this for you (refer to the embedded instructions):
+
+https://github.com/chaudron/perf_scripts/blob/master/analyze_perf_pmd_syscall.py
+
+
+Using triggers
+--------------
+The tool supports start and, or stop triggers. This will allow you to capture
+statistics triggered by a specific event. The following combinations of
+stop-and-start triggers can be used.
+
+If you only use ``--start-trigger``, the inspection start when the trigger
+happens and runs until the ``--sample-time`` number of seconds has passed.
+The example below shows all the supported options in this scenario.
+
+.. code-block:: console
+
+ $ sudo ./kernel_delay.py --start-trigger up:bridge_run --sample-time 4 \
+ --sample-count 4 --sample-interval 1
+
+
+If you only use ``--stop-trigger``, the inspection starts immediately and
+stops when the trigger happens. The example below shows all the supported
+options in this scenario.
+
+.. code-block:: console
+
+ $ sudo ./kernel_delay.py --stop-trigger upr:bridge_run \
+ --sample-count 4 --sample-interval 1
+
+
+If you use both ``--start-trigger`` and ``--stop-trigger`` triggers, the
+statistics are captured between the two first occurrences of these events.
+The example below shows all the supported options in this scenario.
+
+.. code-block:: console
+
+ $ sudo ./kernel_delay.py --start-trigger up:bridge_run \
+ --stop-trigger upr:bridge_run \
+ --sample-count 4 --sample-interval 1 \
+ --trigger-delta 50000
+
+What triggers are supported? Note that what ``kernel_delay.py`` calls triggers,
+BCC [#BCC]_, calls events; these are eBPF tracepoints you can attach to.
+For more details on the supported tracepoints, check out the BCC
+documentation [#BCC_EVENT]_.
+
+The list below shows the supported triggers and their argument format:
+
+**USDT probes:**
+ [u|usdt]:{provider}:{probe}
+**Kernel tracepoint:**
+ [t:trace]:{system}:{event}
+**kprobe:**
+ [k:kprobe]:{kernel_function}
+**kretprobe:**
+ [kr:kretprobe]:{kernel_function}
+**uprobe:**
+ [up:uprobe]:{function}
+**uretprobe:**
+ [upr:uretprobe]:{function}
+
+Here are a couple of trigger examples, more use-case-specific examples can be
+found in the *Examples* section.
+
+.. code-block:: console
+
+ --start|stop-trigger u:udpif_revalidator:start_dump
+ --start|stop-trigger t:openvswitch:ovs_dp_upcall
+ --start|stop-trigger k:ovs_dp_process_packet
+ --start|stop-trigger kr:ovs_dp_process_packet
+ --start|stop-trigger up:bridge_run
+ --start|stop-trigger upr:bridge_run
+
+
+Examples
+--------
+This section will give some examples of how to use this tool in real-world
+scenarios. Let's start with the issue where Open vSwitch reports
+``Unreasonably long XXXXms poll interval`` on your revalidator threads. Note
+that there is a blog available explaining how the revalidator process works
+in OVS [#REVAL_BLOG]_.
+
+First, let me explain this log message. It gets logged if the time delta
+between two ``poll_block()`` calls is more than 1 second. In other words,
+the process was spending a lot of time processing stuff that was made
+available by the return of the ``poll_block()`` function.
+
+Do a run with the tool using the existing USDT revalidator probes as a start
+and stop trigger (Note that the resource-specific data is removed from the none
+revalidator threads):
+
+.. code-block:: console
+
+ $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump --stop-trigger u:udpif_revalidator:sweep_done
+ # Start sampling (trigger@791777093512008) @2023-06-14T14:52:00.110303 (12:52:00 UTC)
+ # Stop sampling (trigger@791778281498462) @2023-06-14T14:52:01.297975 (12:52:01 UTC)
+ # Triggered sample dump, stop-start delta 1,187,986,454 ns @2023-06-14T14:52:01.298021 (12:52:01 UTC)
+ TID THREAD <RESOURCE SPECIFIC>
+ ---------- ---------------- ----------------------------------------------------------------------------
+ 1457761 handler24 [SYSCALL STATISTICS]
+ NAME NUMBER COUNT TOTAL ns MAX ns
+ sendmsg 46 6110 123,274,761 41,776
+ recvmsg 47 136299 99,397,508 49,896
+ futex 202 51 7,655,832 7,536,776
+ poll 7 4068 1,202,883 2,907
+ getrusage 98 2034 586,602 1,398
+ sendto 44 9 213,682 27,417
+ TOTAL( - poll): 144503 231,128,385
+
+ [THREAD RUN STATISTICS]
+ SCHED_CNT TOTAL ns MIN ns MAX ns
+
+ [THREAD READY STATISTICS]
+ SCHED_CNT TOTAL ns MAX ns
+ 1 1,438 1,438
+
+ [SOFT IRQ STATISTICS]
+ NAME VECT_NR COUNT TOTAL ns MAX ns
+ sched 7 21 59,145 3,769
+ rcu 9 50 42,917 2,234
+ TOTAL: 71 102,062
+ 1457733 ovs-vswitchd [SYSCALL STATISTICS]
+ ...
+ 1457792 revalidator55 [SYSCALL STATISTICS]
+ NAME NUMBER COUNT TOTAL ns MAX ns
+ futex 202 73 572,576,329 19,621,600
+ recvmsg 47 815 296,697,618 405,338
+ sendto 44 3 78,302 26,837
+ sendmsg 46 3 38,712 13,250
+ write 1 1 5,073 5,073
+ TOTAL( - poll): 895 869,396,034
+
+ [THREAD RUN STATISTICS]
+ SCHED_CNT TOTAL ns MIN ns MAX ns
+ 48 394,350,393 1,729 140,455,796
+
+ [THREAD READY STATISTICS]
+ SCHED_CNT TOTAL ns MAX ns
+ 49 23,650 1,559
+
+ [SOFT IRQ STATISTICS]
+ NAME VECT_NR COUNT TOTAL ns MAX ns
+ sched 7 14 26,889 3,041
+ rcu 9 28 23,024 1,600
+ TOTAL: 42 49,913
+
+
+Above you see from the start of the output that the trigger took more than a
+second (1,187,986,454 ns), which is already know, by looking at the output of
+the ``ovs-vsctl upcall/show`` command.
+
+From the *revalidator55*'s ``SYSCALL STATISTICS`` statistics you can see it
+spent almost 870ms handling syscalls, and there were no poll() calls being
+executed. The ``THREAD RUN STATISTICS`` statistics here are a bit misleading,
+as it looks like OVS only spent 394ms on the CPU. But earlier, it was mentioned
+that this time does not include the time being on the CPU at the start or stop
+of an event. What is exactly the case here, because USDT probes were used.
+
+From the above data and maybe some ``top`` output, it can be determined that
+the *revalidator55* thread is taking a lot of CPU time, probably because it
+has to do a lot of revalidator work by itself. The solution here is to increase
+the number of revalidator threads, so more work could be done in parallel.
+
+Here is another run of the same command in another scenario:
+
+.. code-block:: console
+
+ $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump --stop-trigger u:udpif_revalidator:sweep_done
+ # Start sampling (trigger@795160501758971) @2023-06-14T15:48:23.518512 (13:48:23 UTC)
+ # Stop sampling (trigger@795160764940201) @2023-06-14T15:48:23.781381 (13:48:23 UTC)
+ # Triggered sample dump, stop-start delta 263,181,230 ns @2023-06-14T15:48:23.781414 (13:48:23 UTC)
+ TID THREAD <RESOURCE SPECIFIC>
+ ---------- ---------------- ----------------------------------------------------------------------------
+ 1457733 ovs-vswitchd [SYSCALL STATISTICS]
+ ...
+ 1457792 revalidator55 [SYSCALL STATISTICS]
+ NAME NUMBER COUNT TOTAL ns MAX ns
+ recvmsg 47 284 193,422,110 46,248,418
+ sendto 44 2 46,685 23,665
+ sendmsg 46 2 24,916 12,703
+ write 1 1 6,534 6,534
+ TOTAL( - poll): 289 193,500,245
+
+ [THREAD RUN STATISTICS]
+ SCHED_CNT TOTAL ns MIN ns MAX ns
+ 2 47,333,558 331,516 47,002,042
+
+ [THREAD READY STATISTICS]
+ SCHED_CNT TOTAL ns MAX ns
+ 3 87,000,403 45,999,712
+
+ [SOFT IRQ STATISTICS]
+ NAME VECT_NR COUNT TOTAL ns MAX ns
+ sched 7 2 9,504 5,109
+ TOTAL: 2 9,504
+
+
+Here you can see the revalidator run took about 263ms, which does not look
+odd, however, the ``THREAD READY STATISTICS`` information shows that OVS was
+waiting 87ms for a CPU to be run on. This means the revalidator process could
+have finished 87ms faster. Looking at the ``MAX ns`` value, a worst-case delay
+of almost 46ms can be seen, which hints at an overloaded system.
+
+One final example that uses a ``uprobe`` to get some statistics on a
+``bridge_run()`` execution that takes more than 1ms.
+
+.. code-block:: console
+
+ $ sudo ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run --trigger-delta 1000000
+ # Start sampling (trigger@2245245432101270) @2023-06-14T16:21:10.467919 (14:21:10 UTC)
+ # Stop sampling (trigger@2245245432414656) @2023-06-14T16:21:10.468296 (14:21:10 UTC)
+ # Sample dump skipped, delta 313,386 ns @2023-06-14T16:21:10.468419 (14:21:10 UTC)
+ # Start sampling (trigger@2245245505301745) @2023-06-14T16:21:10.540970 (14:21:10 UTC)
+ # Stop sampling (trigger@2245245506911119) @2023-06-14T16:21:10.542499 (14:21:10 UTC)
+ # Triggered sample dump, stop-start delta 1,609,374 ns @2023-06-14T16:21:10.542565 (14:21:10 UTC)
+ TID THREAD <RESOURCE SPECIFIC>
+ ---------- ---------------- ----------------------------------------------------------------------------
+ 3371035 <unknown:3366258/3371035> [SYSCALL STATISTICS]
+ ... <REMOVED 7 MORE unknown THREADS>
+ 3371102 handler66 [SYSCALL STATISTICS]
+ ... <REMOVED 7 MORE HANDLER THREADS>
+ 3366258 ovs-vswitchd [SYSCALL STATISTICS]
+ NAME NUMBER COUNT TOTAL ns MAX ns
+ futex 202 43 403,469 199,312
+ clone3 435 13 174,394 30,731
+ munmap 11 8 115,774 21,861
+ poll 7 5 92,969 38,307
+ unlink 87 2 49,918 35,741
+ mprotect 10 8 47,618 13,201
+ accept 43 10 31,360 6,976
+ mmap 9 8 30,279 5,776
+ write 1 6 27,720 11,774
+ rt_sigprocmask 14 28 12,281 970
+ read 0 6 9,478 2,318
+ recvfrom 45 3 7,024 4,024
+ sendto 44 1 4,684 4,684
+ getrusage 98 5 4,594 1,342
+ close 3 2 2,918 1,627
+ recvmsg 47 1 2,722 2,722
+ TOTAL( - poll): 144 924,233
+
+ [THREAD RUN STATISTICS]
+ SCHED_CNT TOTAL ns MIN ns MAX ns
+ 13 817,605 5,433 524,376
+
+ [THREAD READY STATISTICS]
+ SCHED_CNT TOTAL ns MAX ns
+ 14 28,646 11,566
+
+ [SOFT IRQ STATISTICS]
+ NAME VECT_NR COUNT TOTAL ns MAX ns
+ rcu 9 1 2,838 2,838
+ TOTAL: 1 2,838
+
+ 3371110 revalidator74 [SYSCALL STATISTICS]
+ ... <REMOVED 7 MORE NEW revalidator THREADS>
+ 3366311 urcu3 [SYSCALL STATISTICS]
+ ...
+
+
+OVS removed some of the threads and their resource-specific data, but based
+on the ``<unknown:3366258/3371035>`` thread name, you can determine that some
+threads no longer exist. In the ``ovs-vswitchd`` thread, you can see some
+``clone3`` syscalls, indicating threads were created. In this example, it was
+due to the deletion of a bridge, which resulted in the recreation of the
+revalidator and handler threads.
+
+
+Use with Openshift
+------------------
+This section describes how you would use the tool on a node in an OpenShift
+cluster. It assumes you have console access to the node, either directly or
+through a debug container.
+
+A base fedora38 container will be used through podman, as this will allow the
+use of some additional tools and packages needed.
+
+First the containers need to be started:
+
+.. code-block:: console
+
+ [core@sno-master ~]$ sudo podman run -it --rm \
+ -e PS1='[(DEBUG)\u@\h \W]\$ ' \
+ --privileged --network=host --pid=host \
+ -v /lib/modules:/lib/modules:ro \
+ -v /sys/kernel/debug:/sys/kernel/debug \
+ -v /proc:/proc \
+ -v /:/mnt/rootdir \
+ quay.io/fedora/fedora:38-x86_64
+
+ [(DEBUG)root@sno-master /]#
+
+
+Next add the ``linux_delay.py`` dependencies:
+
+.. code-block:: console
+
+ [(DEBUG)root@sno-master /]# dnf install -y bcc-tools perl-interpreter \
+ python3-pytz python3-psutil
+
+
+You need to install the devel, debug and source RPMs for your OVS and kernel
+version:
+
+.. code-block:: console
+
+ [(DEBUG)root@sno-master home]# rpm -i \
+ openvswitch2.17-debuginfo-2.17.0-67.el8fdp.x86_64.rpm \
+ openvswitch2.17-debugsource-2.17.0-67.el8fdp.x86_64.rpm \
+ kernel-devel-4.18.0-372.41.1.el8_6.x86_64.rpm
+
+
+Now the tool can be started. Here the above ``bridge_run()`` example is used:
+
+.. code-block:: console
+
+ [(DEBUG)root@sno-master home]# ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger ur:bridge_run
+ # Start sampling (trigger@75279117343513) @2023-06-15T11:44:07.628372 (11:44:07 UTC)
+ # Stop sampling (trigger@75279117443980) @2023-06-15T11:44:07.628529 (11:44:07 UTC)
+ # Triggered sample dump, stop-start delta 100,467 ns @2023-06-15T11:44:07.628569 (11:44:07 UTC)
+ TID THREAD <RESOURCE SPECIFIC>
+ ---------- ---------------- ----------------------------------------------------------------------------
+ 1246 ovs-vswitchd [SYSCALL STATISTICS]
+ NAME NUMBER COUNT TOTAL ns MAX ns
+ getdents64 217 2 8,560 8,162
+ openat 257 1 6,951 6,951
+ accept 43 4 6,942 3,763
+ recvfrom 45 1 3,726 3,726
+ recvmsg 47 2 2,880 2,188
+ stat 4 2 1,946 1,384
+ close 3 1 1,393 1,393
+ fstat 5 1 1,324 1,324
+ TOTAL( - poll): 14 33,722
+
+ [THREAD RUN STATISTICS]
+ SCHED_CNT TOTAL ns MIN ns MAX ns
+
+ [THREAD READY STATISTICS]
+ SCHED_CNT TOTAL ns MAX ns
+
+
+.. rubric:: Footnotes
+
+.. [#BCC] https://github.com/iovisor/bcc
+.. [#BCC_EVENT] https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments
+.. [#REVAL_BLOG] https://developers.redhat.com/articles/2022/10/19/open-vswitch-revalidator-process-explained
This patch adds an utility that can be used to determine if an issue is related to a lack of Linux kernel resources. This tool is also featured in a Red Hat developers blog article: https://developers.redhat.com/articles/2023/07/24/troubleshooting-open-vswitch-kernel-blame Signed-off-by: Eelco Chaudron <echaudro@redhat.com> --- v2: Addressed review comments from Aaron. v3: Changed wording in documentation. utilities/automake.mk | 4 utilities/usdt-scripts/kernel_delay.py | 1402 +++++++++++++++++++++++++++++++ utilities/usdt-scripts/kernel_delay.rst | 596 +++++++++++++ 3 files changed, 2002 insertions(+) create mode 100755 utilities/usdt-scripts/kernel_delay.py create mode 100644 utilities/usdt-scripts/kernel_delay.rst