@@ -48,3 +48,27 @@ dpdk_get_vhost_sock_dir(void)
{
return NULL;
}
+
+/* Stub: accepts 'core_id' and does nothing with it. */
+void
+dpdk_ka_register_core(unsigned core_id OVS_UNUSED)
+{
+    /* Intentionally empty. */
+}
+
+/* Stub: marking a core alive is a no-op in this file. */
+void
+dpdk_ka_mark_core_alive(void)
+{
+    /* Intentionally empty. */
+}
+
+/* Stub: there is no shared memory block here, so nothing is stored for
+ * 'core_id'. */
+void
+dpdk_ka_shm_store_tid(unsigned core_id OVS_UNUSED)
+{
+    /* Intentionally empty. */
+}
+
+/* Stub: keepalive is always reported as disabled. */
+bool
+dpdk_is_ka_enabled(void)
+{
+    const bool ka_enabled = false;
+
+    return ka_enabled;
+}
@@ -15,12 +15,14 @@
*/
#include <config.h>
+#include <stdbool.h>
#include "dpdk.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <getopt.h>
+#include <rte_keepalive.h>
#include <rte_memzone.h>
#ifdef DPDK_PDUMP
#include <rte_mempool.h>
@@ -38,6 +40,10 @@ VLOG_DEFINE_THIS_MODULE(dpdk);
static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */
+/* Keepalive (KA) configuration.  These are written in dpdk_init__() when
+ * "dpdk-keepalive" is true and read back through the dpdk_is_ka_enabled(),
+ * dpdk_get_ka_interval() and dpdk_get_ka_shm() accessors. */
+bool ovs_keepalive_enable = false; /* KA feature disabled by default */
+static int keepalive_timer_interval; /* OvS-DPDK keepalive timer interval */
+/* Name of the keepalive shared memory block; NULL until configured. */
+static const char *keepalive_shm_blk = NULL;
+
static int
process_vhost_flags(char *flag, char *default_val, int size,
const struct smap *ovs_other_config,
@@ -63,6 +69,35 @@ process_vhost_flags(char *flag, char *default_val, int size,
return changed;
}
+/* Retrieves the keepalive timer interval, in milliseconds, from the
+ * "dpdk-keepalive-interval" key of 'ovs_other_config'.
+ *
+ * Returns OVS_KEEPALIVE_TIMEOUT when the key is absent or when the configured
+ * value is not strictly positive.  A zero or negative interval must never be
+ * returned: the value is later widened to uint64_t and used as a sleep
+ * duration, where a negative number would become an enormous delay. */
+static int
+get_ka_timer_interval(const struct smap *ovs_other_config)
+{
+#define OVS_KEEPALIVE_TIMEOUT 100    /* Default timeout set to 100ms */
+    int ka_interval;
+
+    /* Timer granularity in milliseconds.
+     * Defaults to OVS_KEEPALIVE_TIMEOUT(ms) if not set. */
+    ka_interval = smap_get_int(ovs_other_config, "dpdk-keepalive-interval",
+                               OVS_KEEPALIVE_TIMEOUT);
+    if (ka_interval <= 0) {
+        VLOG_WARN("Invalid keepalive timer interval %d, using default %d",
+                  ka_interval, OVS_KEEPALIVE_TIMEOUT);
+        ka_interval = OVS_KEEPALIVE_TIMEOUT;
+    }
+
+    VLOG_INFO("The keepalive timer interval: %d\n", ka_interval);
+    return ka_interval;
+}
+
+/* Determines the name of the keepalive shared memory block from the
+ * "dpdk-keepalive-shm-name" key of 'ovs_other_config', falling back to
+ * "/dpdk_keepalive_shm_name" when the key is absent or empty.
+ *
+ * The result is stored in 'keepalive_shm_blk' and also returned.  A name
+ * taken from the configuration is duplicated with xstrdup(): smap_get()
+ * returns a pointer owned by the smap, which must not be kept beyond the
+ * lifetime of 'ovs_other_config'.  The copy is intentionally never freed; it
+ * has to stay valid for as long as dpdk_get_ka_shm() may be called. */
+static const char *
+get_ka_shm_block(const struct smap *ovs_other_config)
+{
+#define OVS_KEEPALIVE_SHM_NAME /dpdk_keepalive_shm_name  /* Shared mem block. */
+    const char *cfg_name = smap_get(ovs_other_config,
+                                    "dpdk-keepalive-shm-name");
+
+    if (cfg_name && cfg_name[0] != '\0') {
+        keepalive_shm_blk = xstrdup(cfg_name);
+    } else {
+        keepalive_shm_blk = OVS_STRINGIZE(OVS_KEEPALIVE_SHM_NAME);
+    }
+
+    VLOG_INFO("The keepalive Shared Memory block: %s\n", keepalive_shm_blk);
+    return keepalive_shm_blk;
+}
+
static char **
grow_argv(char ***argv, size_t cur_siz, size_t grow_by)
{
@@ -281,6 +316,15 @@ dpdk_init__(const struct smap *ovs_other_config)
}
VLOG_INFO("DPDK Enabled, initializing");
+
+ if (smap_get_bool(ovs_other_config, "dpdk-keepalive", false)) {
+ ovs_keepalive_enable = true;
+ VLOG_INFO("OVSDPDK keepalive enabled \n");
+
+ keepalive_timer_interval = get_ka_timer_interval(ovs_other_config);
+ keepalive_shm_blk = get_ka_shm_block(ovs_other_config);
+ }
+
if (process_vhost_flags("vhost-sock-dir", xstrdup(ovs_rundir()),
NAME_MAX, ovs_other_config,
&sock_dir_subcomponent)) {
@@ -435,3 +479,42 @@ dpdk_set_lcore_id(unsigned cpu)
ovs_assert(cpu != NON_PMD_CORE_ID);
RTE_PER_LCORE(_lcore_id) = cpu;
}
+
+/* Returns 'true' if the keepalive feature was enabled through
+ * "dpdk-keepalive", otherwise 'false'.
+ *
+ * Defined with an explicit (void) parameter list and without 'inline' so the
+ * definition matches the prototype "bool dpdk_is_ka_enabled(void);" and is
+ * unambiguously an external definition that other files can link against. */
+bool
+dpdk_is_ka_enabled(void)
+{
+    return ovs_keepalive_enable;
+}
+
+/* Accessor for the configured keepalive timer interval (the value stored in
+ * 'keepalive_timer_interval'), widened to uint64_t. */
+uint64_t
+dpdk_get_ka_interval(void)
+{
+    const int interval = keepalive_timer_interval;
+
+    return interval;
+}
+
+/* Accessor for the keepalive shared memory block name.  Yields whatever is
+ * currently stored in 'keepalive_shm_blk', which is NULL until a name has
+ * been configured. */
+const char *
+dpdk_get_ka_shm(void)
+{
+    const char *shm_name = keepalive_shm_blk;
+
+    return shm_name;
+}
+
+/* Registers packet processing core 'core_id' for liveness checks.
+ *
+ * Does nothing when keepalive is disabled, or when the keepalive handle has
+ * not been created (keepalive_init() failed or has not run yet).  Without
+ * the second check a NULL handle would be passed to DPDK. */
+void
+dpdk_ka_register_core(unsigned core_id)
+{
+    if (!dpdk_is_ka_enabled() || !rte_global_keepalive_info) {
+        return;
+    }
+
+    rte_keepalive_register_core(rte_global_keepalive_info, core_id);
+}
+
+/* Marks the calling packet processing core as alive.
+ *
+ * Does nothing when keepalive is disabled, or when the keepalive handle has
+ * not been created (keepalive_init() failed or has not run yet).  This is
+ * called from the PMD main loop, so passing a NULL handle to DPDK here would
+ * take down the datapath thread. */
+void
+dpdk_ka_mark_core_alive(void)
+{
+    if (!dpdk_is_ka_enabled() || !rte_global_keepalive_info) {
+        return;
+    }
+
+    rte_keepalive_mark_alive(rte_global_keepalive_info);
+}
@@ -26,6 +26,9 @@
#else
+#include <stdint.h>
+#include <stdbool.h>
+
#define NON_PMD_CORE_ID UINT32_MAX
#endif /* DPDK_NETDEV */
@@ -35,5 +38,10 @@ struct smap;
void dpdk_init(const struct smap *ovs_other_config);
void dpdk_set_lcore_id(unsigned cpu);
const char *dpdk_get_vhost_sock_dir(void);
-
+/* Keepalive (KA) interface. */
+/* Returns the configured keepalive timer interval. */
+uint64_t dpdk_get_ka_interval(void);
+/* Returns true if the keepalive feature is enabled. */
+bool dpdk_is_ka_enabled(void);
+/* Registers packet processing core 'core_id' for liveness checks. */
+void dpdk_ka_register_core(unsigned core_id);
+/* Marks the calling packet processing core as alive. */
+void dpdk_ka_mark_core_alive(void);
+/* Returns the name of the keepalive shared memory block. */
+const char *dpdk_get_ka_shm(void);
+/* Records the calling thread's id for 'core_id' in shared memory. */
+void dpdk_ka_shm_store_tid(unsigned core_id);
#endif /* dpdk.h */
@@ -3141,6 +3141,9 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
+
+ /* Register core for Keepalive detection. */
+ dpdk_ka_register_core(core->core_id);
}
/* Log the number of pmd threads per numa node. */
@@ -3522,6 +3525,9 @@ pmd_thread_main(void *f_)
ovs_numa_thread_setaffinity_core(pmd->core_id);
dpdk_set_lcore_id(pmd->core_id);
poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
+
+ /* Store the pmd thread_id in shared memory. */
+ dpdk_ka_shm_store_tid(pmd->core_id);
reload:
emc_cache_init(&pmd->flow_cache);
@@ -3546,6 +3552,9 @@ reload:
poll_list[i].port_no);
}
+ /* Mark packet processing core alive if Keepalive is enabled. */
+ dpdk_ka_mark_core_alive();
+
if (lc++ > 1024) {
bool reload;
@@ -22,6 +22,9 @@
#include <stdlib.h>
#include <errno.h>
#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <fcntl.h>
#include <rte_config.h>
#include <rte_cycles.h>
@@ -32,6 +35,7 @@
#include <rte_mbuf.h>
#include <rte_meter.h>
#include <rte_virtio_net.h>
+#include <rte_keepalive.h>
#include "dirs.h"
#include "dp-packet.h"
@@ -48,8 +52,9 @@
#include "ovs-numa.h"
#include "ovs-thread.h"
#include "ovs-rcu.h"
-#include "packets.h"
#include "openvswitch/shash.h"
+#include "packets.h"
+#include "process.h"
#include "smap.h"
#include "sset.h"
#include "unaligned.h"
@@ -391,6 +396,29 @@ struct netdev_rxq_dpdk {
int port_id;
};
+/*
+ * OVS Shared Memory structure
+ *
+ * The information in the shared memory block will be read by collectd.
+ *
+ * NOTE(review): because an external process maps this structure, its field
+ * order and types presumably form an interface with that reader -- confirm
+ * before changing the layout.
+ * */
+struct dpdk_keepalive_shm {
+ /* IPC semaphore. Posted when a core dies */
+ sem_t core_died;
+
+ /*
+ * Relayed status of each core.
+ * UNUSED[0], ALIVE[1], DEAD[2], GONE[3], MISSING[4], DOZING[5], SLEEP[6]
+ **/
+ enum rte_keepalive_state core_state[RTE_KEEPALIVE_MAXCORES];
+
+ /* Last seen timestamp of the core */
+ uint64_t core_last_seen_times[RTE_KEEPALIVE_MAXCORES];
+
+ /* Thread id of the pmd thread on each core, indexed by core id.  Written
+ * by dpdk_ka_shm_store_tid(). */
+ pid_t thread_id[RTE_KEEPALIVE_MAXCORES];
+};
+
+/* Mapping of the shared memory block; assigned in keepalive_init(). */
+static struct dpdk_keepalive_shm *ka_shm;
static int netdev_dpdk_class_init(void);
static int netdev_dpdk_vhost_class_init(void);
@@ -573,6 +601,175 @@ netdev_dpdk_mempool_configure(struct netdev_dpdk *dev)
return 0;
}
+/* Callback function invoked on heartbeat miss.  Verifies whether it is a
+ * genuine heartbeat miss or a false positive and logs accordingly.
+ *
+ * 'ptr_data' is the 'struct dpdk_keepalive_shm' registered with
+ * rte_keepalive_create(); 'core_id' is the core that missed its heartbeat.
+ * The thread id recorded for that core is looked up and its state checked
+ * with get_process_status().  Nothing is checked when 'ptr_data' is NULL or
+ * 'core_id' is outside the thread_id[] array. */
+static void
+dpdk_failcore_cb(void *ptr_data, const int core_id)
+{
+    struct dpdk_keepalive_shm *shm = ptr_data;
+    pid_t caller_tid = syscall(SYS_gettid);
+    int pmd_tid;
+
+    VLOG_INFO("%s_%d: Called by threadid:%d\n", __FUNCTION__, __LINE__,
+              (int) caller_tid);
+
+    if (!shm || core_id < 0 || core_id >= RTE_KEEPALIVE_MAXCORES) {
+        return;
+    }
+
+    pmd_tid = shm->thread_id[core_id];
+    if (get_process_status(pmd_tid) != ACTIVE_STATE) {
+        VLOG_INFO("Pmd thread tid %d on core[%d] is unresponsive \n",
+                  pmd_tid, core_id);
+    } else {
+        VLOG_INFO("False positive!!, pmd thread alive, tid(%d)\n", pmd_tid);
+    }
+}
+
+/* Notifies the external monitoring application of a change in core state.
+ *
+ * 'ptr_data' is the 'struct dpdk_keepalive_shm' registered with
+ * rte_keepalive_register_relay_callback().  The state of 'core_id' and its
+ * 'last_alive' timestamp are written to the shared memory block.  When the
+ * core is reported dead, the status is relayed to the monitoring framework
+ * by posting the 'core_died' semaphore.
+ *
+ * Calls with a NULL 'ptr_data' or a 'core_id' outside the shared arrays are
+ * ignored. */
+static void
+dpdk_ka_relay_core_state(void *ptr_data, const int core_id,
+                         const enum rte_keepalive_state core_state,
+                         uint64_t last_alive)
+{
+    struct dpdk_keepalive_shm *shm = ptr_data;
+    int count;
+
+    if (!shm) {
+        VLOG_DBG("Invalid shared memory block\n");
+        return;
+    }
+
+    if (core_id < 0 || core_id >= RTE_KEEPALIVE_MAXCORES) {
+        VLOG_DBG("Invalid core id %d", core_id);
+        return;
+    }
+
+    /* 'last_alive' is a uint64_t: print it through an explicit
+     * unsigned long long cast so the conversion specifier matches on every
+     * platform. */
+    VLOG_DBG("TS(%lu):CORE%d, Old state:%d, current_state:%d, "
+             "last-alive:%llu\n",
+             (unsigned long) time(NULL), core_id, shm->core_state[core_id],
+             core_state, (unsigned long long) last_alive);
+
+    switch (core_state) {
+    /* TBD - Fix this, post monitoring framework changes to handle
+     * all core states. */
+    case RTE_KA_STATE_ALIVE:
+    case RTE_KA_STATE_MISSING:
+    case RTE_KA_STATE_DOZING:
+    case RTE_KA_STATE_SLEEP:
+        /* All of these are reported as ALIVE for now. */
+        shm->core_state[core_id] = RTE_KA_STATE_ALIVE;
+        shm->core_last_seen_times[core_id] = last_alive;
+        break;
+    case RTE_KA_STATE_DEAD:
+    case RTE_KA_STATE_GONE:
+        shm->core_state[core_id] = core_state;
+        shm->core_last_seen_times[core_id] = last_alive;
+        break;
+    case RTE_KA_STATE_UNUSED:
+        shm->core_state[core_id] = RTE_KA_STATE_UNUSED;
+        break;
+    }
+
+    if (OVS_UNLIKELY(core_state == RTE_KA_STATE_DEAD)) {
+        /* To handle an inactive collectd, limit how far the semaphore can
+         * be incremented: it is only posted while its value is 0 or 1. */
+        if (sem_getvalue(&shm->core_died, &count) == -1) {
+            VLOG_WARN("Semaphore check failed\n");
+            return;
+        }
+
+        if (count > 1) {
+            return;
+        }
+
+        VLOG_DBG("Going to post semaphore\n");
+        if (sem_post(&shm->core_died) != 0) {
+            VLOG_INFO("Failed to increment semaphore\n");
+        }
+    }
+}
+
+/* Creates the POSIX shared memory object named by dpdk_get_ka_shm(), maps
+ * it, and initializes the semaphore and per-core state inside it.
+ *
+ * Any stale object with the same name is unlinked first.  Every core is
+ * initially marked RTE_KA_STATE_UNUSED.
+ *
+ * Returns the mapped block, or NULL on failure.  On every failure path the
+ * file descriptor is closed and the mapping, if any, is released.
+ *
+ * NOTE(review): the object is created with mode 0666, presumably so that the
+ * external monitoring process can open it -- confirm that world-writable
+ * access is intended. */
+static struct dpdk_keepalive_shm *
+dpdk_keepalive_shm_create(void)
+{
+    /* Use the configured name directly instead of copying it into a
+     * fixed-size buffer, which could overflow for long names. */
+    const char *shm_name = dpdk_get_ka_shm();
+    struct dpdk_keepalive_shm *shm;
+    int coreid;
+    int fd;
+
+    if (!shm_name) {
+        VLOG_WARN("Keepalive shared memory block name is not set");
+        return NULL;
+    }
+
+    if (shm_unlink(shm_name) == -1 && errno != ENOENT) {
+        VLOG_WARN("Error unlinking stale %s", shm_name);
+    }
+
+    fd = shm_open(shm_name, O_CREAT | O_TRUNC | O_RDWR, 0666);
+    if (fd < 0) {
+        VLOG_WARN("Failed to open %s as SHM \n", shm_name);
+        return NULL;
+    }
+
+    if (ftruncate(fd, sizeof *shm) != 0) {
+        VLOG_WARN("Failed to resize SHM \n");
+        close(fd);
+        return NULL;
+    }
+
+    shm = mmap(NULL, sizeof *shm, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    /* The mapping stays valid after the descriptor is closed. */
+    close(fd);
+    if (shm == MAP_FAILED) {
+        VLOG_WARN("Failed to mmap SHM \n");
+        return NULL;
+    }
+
+    memset(shm, 0, sizeof *shm);
+
+    /* Initialize the semaphore for IPC/SHM use (process-shared, value 0). */
+    if (sem_init(&shm->core_died, 1, 0) != 0) {
+        VLOG_WARN("Failed to setup SHM semaphore \n");
+        munmap(shm, sizeof *shm);
+        return NULL;
+    }
+
+    /* Mark all cores to 'not present'. */
+    for (coreid = 0; coreid < RTE_KEEPALIVE_MAXCORES; coreid++) {
+        shm->core_state[coreid] = RTE_KA_STATE_UNUSED;
+        shm->core_last_seen_times[coreid] = 0;
+    }
+
+    return shm;
+}
+
+/* Sets up the keepalive sub-system: creates the shared memory block, creates
+ * the DPDK keepalive handle with dpdk_failcore_cb() as the failure callback,
+ * and registers dpdk_ka_relay_core_state() as the relay callback.
+ *
+ * Returns 0 on success, -1 if either the shared memory block or the
+ * keepalive handle could not be created. */
+int
+keepalive_init(void)
+{
+    /* Shared memory block first; the callbacks receive it as their data. */
+    ka_shm = dpdk_keepalive_shm_create();
+    if (!ka_shm) {
+        VLOG_ERR("dpdk_keepalive_shm_create() failed\n");
+        return -1;
+    }
+
+    /* Keepalive handle. */
+    rte_global_keepalive_info = rte_keepalive_create(&dpdk_failcore_cb,
+                                                     ka_shm);
+    if (!rte_global_keepalive_info) {
+        VLOG_ERR("Keepalive initialization failed\n");
+        return -1;
+    }
+
+    rte_keepalive_register_relay_callback(rte_global_keepalive_info,
+                                          dpdk_ka_relay_core_state, ka_shm);
+    return 0;
+}
+
+/* Body of the keepalive thread.  Detaches itself, then loops forever:
+ * dispatches keepalive pings and sleeps for the configured interval, which
+ * is read once at thread start. */
+static void *
+ovs_keepalive(void *dummy OVS_UNUSED)
+{
+    const uint64_t interval_ms = dpdk_get_ka_interval();
+
+    pthread_detach(pthread_self());
+
+    while (1) {
+        rte_keepalive_dispatch_pings(NULL, rte_global_keepalive_info);
+        /* xusleep() takes microseconds. */
+        xusleep(interval_ms * 1000);
+    }
+
+    return NULL;
+}
+
static void
check_link_status(struct netdev_dpdk *dev)
{
@@ -2560,6 +2757,13 @@ netdev_dpdk_class_init(void)
* needs to be done only once */
if (ovsthread_once_start(&once)) {
ovs_thread_create("dpdk_watchdog", dpdk_watchdog, NULL);
+
+ if (dpdk_is_ka_enabled()) {
+ if (keepalive_init() != -1) {
+ ovs_thread_create("ovs_keepalive", ovs_keepalive, NULL);
+ }
+ }
+
unixctl_command_register("netdev-dpdk/set-admin-state",
"[netdev] up|down", 1, 2,
netdev_dpdk_set_admin_state, NULL);
@@ -3191,3 +3395,15 @@ netdev_dpdk_register(void)
netdev_register_provider(&dpdk_vhost_class);
netdev_register_provider(&dpdk_vhost_client_class);
}
+
+/* Stores the calling thread's id in the keepalive shared memory block, in
+ * the slot for 'core_id'.
+ *
+ * Does nothing when keepalive is disabled, when the shared memory block has
+ * not been created, or when 'core_id' does not fit in the thread_id[] array
+ * (RTE_KEEPALIVE_MAXCORES entries); the last check prevents an out-of-bounds
+ * write into the shared mapping. */
+void
+dpdk_ka_shm_store_tid(unsigned core_id)
+{
+    if (!dpdk_is_ka_enabled()) {
+        return;
+    }
+
+#ifndef _WIN32
+    if (ka_shm && core_id < RTE_KEEPALIVE_MAXCORES) {
+        ka_shm->thread_id[core_id] = syscall(SYS_gettid);
+    }
+#endif
+}
@@ -18,7 +18,7 @@
#define NETDEV_DPDK_H
#include <config.h>
-
+#include <semaphore.h>
#include "openvswitch/compiler.h"
struct dp_packet;
@@ -28,6 +28,11 @@ struct dp_packet;
void netdev_dpdk_register(void);
void free_dpdk_buf(struct dp_packet *);
+/* Keepalive support. */
+/* Initializes the keepalive sub-system; returns 0 on success, -1 on error. */
+int keepalive_init(void);
+/* NOTE(review): no definition of keepalive_create_thread() is visible in
+ * this patch -- confirm it is needed. */
+void keepalive_create_thread(void);
+/* Records the calling thread's id for the given core in shared memory. */
+void dpdk_ka_shm_store_tid(unsigned);
+
+/* Handle of the DPDK keepalive instance; NULL until keepalive_init()
+ * succeeds.
+ * NOTE(review): this is a variable definition in a header, not an 'extern'
+ * declaration, so every file that includes this header defines the symbol.
+ * It should become 'extern' here with a single definition in a .c file. */
+struct rte_keepalive *rte_global_keepalive_info;
#else
static inline void
@@ -50,6 +50,20 @@ struct process {
int status;
};
+/* Maps the one-letter thread state reported in /proc/<tid>/status to an
+ * 'enum process_states' value. */
+struct pstate2Num {
+    const char *tidState;    /* State letter, or NULL for the terminator. */
+    int num;                 /* Matching 'enum process_states' value. */
+};
+
+/* State lookup table.  The table is terminated by an entry whose 'tidState'
+ * is a NULL pointer, which is what get_process_status() tests for; a "NULL"
+ * string literal would never end the scan.
+ *
+ * NOTE(review): proc(5) documents 'S' as (interruptible) sleeping and 'T' as
+ * stopped -- confirm that mapping "S" to STOPPED_STATE is intended. */
+const struct pstate2Num pstate_map[] = {
+    { "S", STOPPED_STATE },
+    { "R", ACTIVE_STATE },
+    { "t", TRACED_STATE },
+    { "Z", DEFUNC_STATE },
+    { "D", UNINTERRUPTIBLE_SLEEP_STATE },
+    { NULL, UNUSED_STATE },
+};
+
/* Pipe used to signal child termination. */
static int fds[2];
@@ -390,6 +404,49 @@ process_run(void)
#endif
}
+/* Returns the state of thread 'tid' as an 'enum process_states' value, read
+ * from the "State:" line of /proc/<tid>/status.
+ *
+ * Returns UNUSED_STATE when the status file cannot be opened, when it has no
+ * "State:" line, when the state letter is not in 'pstate_map', and on
+ * platforms without /proc support (_WIN32).  An errno value is deliberately
+ * not returned: it could collide with a state value (ENOENT is 2, the same
+ * as ACTIVE_STATE) and make a vanished thread look alive. */
+int
+get_process_status(int tid)
+{
+#ifndef _WIN32
+    /* Buffer sizes match the field widths in the scan format below, plus
+     * one byte for the terminator. */
+    char key[17], value[5], detail[15];
+    char path[64];
+    char line[256];
+    int state = UNUSED_STATE;
+    FILE *stream;
+    int ln;
+
+    snprintf(path, sizeof path, "/proc/%d/status", tid);
+    stream = fopen(path, "r");
+    if (stream == NULL) {
+        VLOG_WARN_ONCE("%s: open failed: %s", path, ovs_strerror(errno));
+        return UNUSED_STATE;
+    }
+
+    for (ln = 0; fgets(line, sizeof line, stream); ln++) {
+        size_t i;
+
+        if (!ovs_scan(line, "%16s %4s %14s\n", key, value, detail)) {
+            VLOG_WARN_ONCE("%s: could not parse line %d: %s",
+                           path, ln, line);
+            continue;
+        }
+        if (strcmp(key, "State:")) {
+            continue;
+        }
+
+        /* Bounded by the table size as well as by its terminator. */
+        for (i = 0; i < sizeof pstate_map / sizeof pstate_map[0]
+                    && pstate_map[i].tidState != NULL; i++) {
+            if (!strcmp(pstate_map[i].tidState, value)) {
+                VLOG_DBG("The state is %s, status is %d\n",
+                         pstate_map[i].tidState, pstate_map[i].num);
+                state = pstate_map[i].num;
+                break;
+            }
+        }
+        break;
+    }
+
+    fclose(stream);
+    return state;
+#else
+    return UNUSED_STATE;
+#endif
+}
/* Causes the next call to poll_block() to wake up when process 'p' has
* exited. */
@@ -20,6 +20,15 @@
#include <stdbool.h>
#include <sys/types.h>
+/* Thread states reported by get_process_status(). */
+enum process_states {
+    UNUSED_STATE = 0,                   /* Unknown or not found. */
+    STOPPED_STATE = 1,
+    ACTIVE_STATE = 2,
+    TRACED_STATE = 3,
+    DEFUNC_STATE = 4,
+    UNINTERRUPTIBLE_SLEEP_STATE = 5
+};
+
struct process;
/* Starting and monitoring subprocesses.
@@ -38,6 +47,7 @@ bool process_exited(struct process *);
int process_status(const struct process *);
void process_run(void);
void process_wait(struct process *);
+int get_process_status(int);
/* These functions are thread-safe. */
char *process_status_msg(int);
@@ -2099,6 +2099,18 @@ xsleep(unsigned int seconds)
ovsrcu_quiesce_end();
}
+/* Sleeps for 'microseconds' microseconds, with the calling thread marked as
+ * quiescent for RCU purposes while it sleeps.
+ *
+ * POSIX allows usleep() to fail with EINVAL for intervals of one second or
+ * more, so whole seconds are slept with sleep() and only the remainder with
+ * usleep().  On Windows the interval is rounded up to whole milliseconds so
+ * that a short non-zero request does not turn into Sleep(0). */
+void
+xusleep(unsigned int microseconds)
+{
+    ovsrcu_quiesce_start();
+#ifdef _WIN32
+    Sleep(microseconds / 1000 + (microseconds % 1000 != 0));
+#else
+    if (microseconds >= 1000000) {
+        sleep(microseconds / 1000000);
+    }
+    usleep(microseconds % 1000000);
+#endif
+    ovsrcu_quiesce_end();
+}
+
/* Determine whether standard output is a tty or not. This is useful to decide
* whether to use color output or not when --color option for utilities is set
* to `auto`.
@@ -449,6 +449,7 @@ ovs_u128_and(const ovs_u128 a, const ovs_u128 b)
}
void xsleep(unsigned int seconds);
+void xusleep(unsigned int microseconds);
bool is_stdout_a_tty(void);
@@ -276,6 +276,45 @@
</p>
</column>
+ <column name="other_config" key="dpdk-keepalive"
+ type='{"type": "boolean"}'>
+ <p>
+ Set this value to <code>true</code> to enable DPDK keepalive
+ feature.
+ </p>
+ <p>
+ The default value is <code>false</code>. Changing this value requires
+ restarting the daemon.
+ </p>
+ <p>
+ If this value is <code>false</code> at startup, keepalive thread
+ shall not be spawned.
+ </p>
+ </column>
+
+ <column name="other_config" key="dpdk-keepalive-interval"
+ type='{"type": "integer", "minInteger": 1}'>
+ <p>
+ Specifies the DPDK keepalive timer interval, in milliseconds.
+ </p>
+ <p>
+ If not specified, this will be set to 100 milliseconds (default
+ value). Changing this value requires restarting the daemon.
+ </p>
+ </column>
+
+ <column name="other_config" key="dpdk-keepalive-shm-name"
+ type='{"type": "string"}'>
+ <p>
+ Specifies the DPDK keepalive shared memory block name.
+ </p>
+ <p>
+ If not specified, a shared memory block named "/dpdk_keepalive_shm_name"
+ (default name) is created. Changing this value requires restarting
+ the daemon.
+ </p>
+ </column>
+
<column name="other_config" key="dpdk-extra"
type='{"type": "string"}'>
<p>
This patch is aimed at achieving Fastpath Service Assurance in OVS-DPDK
deployments. This commit adds support for monitoring the packet processing
cores (pmd thread cores) by dispatching heartbeats at regular intervals. In
case of a heartbeat miss, the failure shall be detected and reported to
higher-level fault management systems/frameworks.

The implementation uses a POSIX shared memory object for storing the events
that will be read by the monitoring framework. The keepalive feature can be
enabled through the OVSDB settings below.

    dpdk-keepalive=true
      - Keepalive feature is disabled by default.

    dpdk-keepalive-interval="50"
      - Timer interval in milliseconds for monitoring the packet
        processing cores.

    dpdk-keepalive-shm-name="/dpdk_keepalive_shm_name"
      - Shared memory block name where the events shall be updated.

When KA is enabled, the 'ovs-keepalive' thread shall be spawned; it wakes up
at regular intervals to update the timestamp and status of the pmd cores in
the shared memory region.

An external monitoring framework like collectd (with dpdk plugin support) can
read the status updates from shared memory. On a missing heartbeat, collectd
shall relay the status to the ceilometer service in the controller. Below is
a very high-level overview of the deployment model.

        Compute Node                       Controller

        Collectd  <----------------->  Ceilometer

        OVS DPDK

     +-----+
     | VM  |
     +--+--+
    \---+---/
        |
     +--+---+       +------------+----------+     +------+-------+
     | OVS  |-----> | collectd DPDK plugin  | --> |   collectd   |
     +--+---+       +------------+----------+     +------+-------+

     +------+-----+     +---------------+------------+
     | Ceilometer | <-- | collectd ceilometer plugin | <----
     +------+-----+     +---------------+------------+

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
---
This patch is based on commit '8b6987d799fb0bc530ebb7f767767b1c661548c9'
and [PATCH v2 00/19] DPDK/pmd reconfiguration refactor and bugfixes.

Will post v1 of this patch with documentation updates once "v2 of pmd
reconfiguration" and the documentation patches are upstreamed.

 lib/dpdk-stub.c      |  24 ++++++
 lib/dpdk.c           |  83 ++++++++++++++++++++
 lib/dpdk.h           |  10 ++-
 lib/dpif-netdev.c    |   9 +++
 lib/netdev-dpdk.c    | 218 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/netdev-dpdk.h    |   7 +-
 lib/process.c        |  57 ++++++++++++++
 lib/process.h        |  10 +++
 lib/util.c           |  12 +++
 lib/util.h           |   1 +
 vswitchd/vswitch.xml |  39 +++++++++
 11 files changed, 467 insertions(+), 3 deletions(-)