[ovs-dev,v3,13/19] dpif-netdev: Add additional datapath health checks.

Submitted by Bhanuprakash Bodireddy on Aug. 4, 2017, 8:08 a.m.

Details

Message ID 1501834086-31829-14-git-send-email-bhanuprakash.bodireddy@intel.com
State New
Headers show

Commit Message

Bhanuprakash Bodireddy Aug. 4, 2017, 8:08 a.m.
This commit enables additional datapath health checks. The checks
are enabled only on a PMD heartbeat failure. After three successive
heartbeats are missed, additional health checks need to be performed
on the respective PMD thread to confirm the failure.

The datapath health is monitored periodically from the keepalive thread.
It should be noted that the PMD health checks are only performed on
the PMD threads whose health check is enabled.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
---
 lib/dpif-netdev.c | 30 +++++++++++++++++++++
 lib/keepalive.c   | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/keepalive.h   | 16 +++++++++++
 3 files changed, 127 insertions(+)

Patch hide | download patch | download mbox

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 65db5fd..cf955e5 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -980,6 +980,56 @@  sorted_poll_thread_list(struct dp_netdev *dp,
     *n = k;
 }
 
+/* Performs the additional health checks on 'pmd'.
+ *
+ * This is currently a stub: no checks are implemented yet. */
+static void
+pmd_health_check(struct dp_netdev_pmd_thread *pmd OVS_UNUSED)
+{
+    /* Nothing */
+}
+
+/* Runs pmd_health_check() on every PMD thread of 'dp' that appears in the
+ * keepalive process list with a health check state other than
+ * PMD_HC_DISABLE. */
+static void
+get_datapath_health(struct dp_netdev *dp)
+{
+    /* Cached across calls.  ka_load_process_list() fills it in only while
+     * keepalive is enabled, so it can still be NULL afterwards. */
+    static struct hmap *process_list = NULL;
+    struct ka_process_info *pinfo;
+
+    if (!process_list) {
+        ka_load_process_list(&process_list);
+        if (!process_list) {
+            /* No list to walk; iterating a NULL hmap would crash. */
+            return;
+        }
+    }
+
+    /* NOTE(review): the list is walked without the keepalive process-list
+     * mutex that the ka_*() setters take -- confirm this cannot race with
+     * thread registration or unregistration. */
+    HMAP_FOR_EACH (pinfo, node, process_list) {
+        struct dp_netdev_pmd_thread *pmd;
+
+        /* Check only PMD threads whose health check is enabled. */
+        if (OVS_LIKELY(pinfo->healthcheck == PMD_HC_DISABLE)) {
+            continue;
+        }
+
+        /* NOTE(review): dp_netdev_get_pmd() is understood to return NULL
+         * when no PMD runs on the core and to take a reference otherwise;
+         * confirm against its definition. */
+        pmd = dp_netdev_get_pmd(dp, pinfo->core_id);
+        if (pmd) {
+            pmd_health_check(pmd);
+            dp_netdev_pmd_unref(pmd);
+        }
+    }
+}
+
 static void *
 ovs_keepalive(void *f_)
 {
@@ -991,6 +1020,7 @@  ovs_keepalive(void *f_)
         int n_pmds = cmap_count(&dp->poll_threads) - 1;
         if (n_pmds > 0) {
             dispatch_heartbeats();
+            get_datapath_health(dp);
             get_ka_stats();
         }
 
diff --git a/lib/keepalive.c b/lib/keepalive.c
index 43f8f11..b4d33cc 100644
--- a/lib/keepalive.c
+++ b/lib/keepalive.c
@@ -114,6 +114,7 @@  ka_register_thread(int tid, bool thread_is_pmd)
         ka_pinfo->heartbeats = true;
         ka_pinfo->core_id = core_num;
         ovs_strlcpy(ka_pinfo->name, proc_name, sizeof ka_pinfo->name);
+        ka_pinfo->healthcheck = PMD_HC_DISABLE;
 
         hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
 
@@ -185,6 +186,73 @@  ka_mark_pmd_thread_sleep(void)
 }
 
+/* Enables the health check for the PMD thread registered for 'core_id'.
+ * Does nothing when keepalive is disabled or no matching thread is
+ * registered. */
+void
+ka_enable_pmd_health_check(unsigned core_id)
+{
+    ka_set_pmd_health_check_state(core_id, PMD_HC_ENABLE);
+}
+
+/* Disables the health check for the PMD thread registered for 'core_id'.
+ * Does nothing when keepalive is disabled or no matching thread is
+ * registered. */
+void
+ka_disable_pmd_health_check(unsigned core_id)
+{
+    ka_set_pmd_health_check_state(core_id, PMD_HC_DISABLE);
+}
+
+/* Returns the health check state of the PMD thread registered for
+ * 'core_id', or PMD_HC_DISABLE when keepalive is disabled or no matching
+ * thread is registered.
+ *
+ * Takes 'proclist_mutex' internally, so the caller must not hold it. */
+enum pmdhealth_check
+ka_get_pmd_health_check_state(unsigned core_id)
+    OVS_EXCLUDED(ka_info->proclist_mutex)
+{
+    enum pmdhealth_check hc = PMD_HC_DISABLE;
+
+    if (ka_is_enabled()) {
+        struct ka_process_info *pinfo;
+        int tid = ka_get_pmd_tid(core_id);
+
+        ovs_mutex_lock(&ka_info->proclist_mutex);
+        HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0),
+                                 &ka_info->process_list) {
+            if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) {
+                hc = pinfo->healthcheck;
+            }
+        }
+        ovs_mutex_unlock(&ka_info->proclist_mutex);
+    }
+
+    return hc;
+}
+
+/* Sets the health check state of the PMD thread registered for 'core_id' to
+ * 'state'.  Does nothing when keepalive is disabled or no matching thread
+ * is registered. */
+void
+ka_set_pmd_health_check_state(unsigned core_id, enum pmdhealth_check state)
+{
+    if (ka_is_enabled()) {
+        struct ka_process_info *pinfo;
+        int tid = ka_get_pmd_tid(core_id);
+
+        ovs_mutex_lock(&ka_info->proclist_mutex);
+        HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0),
+                                 &ka_info->process_list) {
+            if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) {
+                pinfo->healthcheck = state;
+            }
+        }
+        ovs_mutex_unlock(&ka_info->proclist_mutex);
+    }
+}
+
 void
 ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
                     uint64_t last_alive)
 {
@@ -202,6 +275,18 @@  ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
     ovs_mutex_unlock(&ka_info->proclist_mutex);
 }
 
+/* Stores a pointer to the keepalive process list in '*process_list'.
+ * Leaves '*process_list' untouched when keepalive is disabled. */
+void
+ka_load_process_list(struct hmap **process_list)
+{
+    if (!ka_is_enabled()) {
+        return;
+    }
+
+    *process_list = &ka_info->process_list;
+}
+
 /* Retrieve and return the keepalive timer interval from OVSDB. */
 static uint32_t
 get_ka_timer_interval(const struct smap *ovs_other_config OVS_UNUSED)
diff --git a/lib/keepalive.h b/lib/keepalive.h
index cedc390..61697b2 100644
--- a/lib/keepalive.h
+++ b/lib/keepalive.h
@@ -40,11 +40,19 @@  enum keepalive_state {
     KA_STATE_CHECK = 7
 };
 
+enum pmdhealth_check {
+    PMD_HC_DISABLE,     /* Default; thread is skipped by the checks. */
+    PMD_HC_ENABLE,      /* Health check requested for the thread. */
+    PMD_HC_PROGRESS,    /* Presumably: check is running; TODO confirm. */
+    PMD_HC_COMPLETE     /* Presumably: check has finished; TODO confirm. */
+};
+
 struct ka_process_info {
     char name[16];
     int tid;
     int core_id;
     bool heartbeats;
+    enum pmdhealth_check healthcheck;
     enum keepalive_state core_state;
     uint64_t core_last_seen_times;
     struct hmap_node node;
@@ -95,6 +103,15 @@  void ka_unregister_thread(int, bool);
 void ka_mark_pmd_thread_alive(void);
 void ka_mark_pmd_thread_sleep(void);
 
+/* PMD health check control. */
+void ka_init_pmd_health_check(void);
+void ka_enable_pmd_health_check(unsigned core_id);
+void ka_disable_pmd_health_check(unsigned core_id);
+bool ka_is_pmdhealth_check_enabled(unsigned core_id);
+enum pmdhealth_check ka_get_pmd_health_check_state(unsigned core_id);
+void ka_set_pmd_health_check_state(unsigned core_id,
+                                   enum pmdhealth_check state);
+
 void ka_store_pmd_id(unsigned core);
 uint32_t get_ka_interval(void);
 int get_ka_init_status(void);
@@ -102,6 +117,7 @@  int ka_alloc_portstats(unsigned, int);
 void ka_destroy_portstats(void);
 void get_ka_stats(void);
 struct smap *ka_stats_run(void);
+void ka_load_process_list(struct hmap **process_list);
 
 void dispatch_heartbeats(void);
 #endif /* keepalive.h */