diff mbox

[ovs-dev,RFC,v2,17/19] netdev-dpdk: Enable PMD health checks on heartbeat failure.

Message ID 1497286187-69287-18-git-send-email-bhanuprakash.bodireddy@intel.com
State Superseded
Headers show

Commit Message

Bodireddy, Bhanuprakash June 12, 2017, 4:49 p.m. UTC
The keepalive thread sends heartbeats to the PMD threads; when a PMD fails
to respond to successive heartbeats, it is potentially stalled. The PMD
state transition is as below:

ALIVE -> MISSING -> DEAD -> GONE

This commit enables PMD health checks when a PMD doesn't respond to
heartbeats. This is needed to handle false negatives. With this commit,
the new state transition is as below:

ALIVE -> MISSING -> DEAD -> CHECK -> GONE

A PMD health-checking state is introduced and kicks in immediately when
the PMD gets into the DEAD state. As part of it, the following are
considered:

  - Link status of the ports polled by PMD thread.
  - Statistics of the ports polled by PMD thread.
  - PMD polling and processing cycles.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
---
 lib/keepalive.h   |  3 +++
 lib/netdev-dpdk.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 56 insertions(+), 2 deletions(-)
diff mbox

Patch

diff --git a/lib/keepalive.h b/lib/keepalive.h
index 7501065..36789ee 100644
--- a/lib/keepalive.h
+++ b/lib/keepalive.h
@@ -71,6 +71,9 @@  struct keepalive_shm {
     /* Last seen timestamp of the core */
     uint64_t core_last_seen_times[KEEPALIVE_MAXCORES];
 
+    /* Number of PMD failures */
+    uint32_t core_failures[KEEPALIVE_MAXCORES];
+
     /* Store pmd thread tid */
     pid_t thread_id[KEEPALIVE_MAXCORES];
 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 24a87bb..15c8c68 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -606,6 +606,51 @@  dpdk_failcore_cb(void *ptr_data, const int core_id)
     }
 }
 
+static void
+dpdk_ka_handle_failure(enum keepalive_state fail_state, const int core_id,
+               const enum rte_keepalive_state core_state,
+               uint64_t last_alive, struct keepalive_shm *ka_shm)
+{
+    if (fail_state == KA_STATE_DEAD) {
+        /* If the process is in DEFUNCT/UNINTERRUPTIBLE/TRACED state, it is
+         * inactive and no additional health checks are needed. */
+        uint32_t tid = ka_get_tid(core_id);
+        if (process_is_active(tid)) {
+           /* Enable the PMD health check only when the PMD is in 'RUNNING'
+            * state and still doesn't respond to heartbeats.  Health checks
+            * analyze other stats, as this is the penultimate state before
+            * the PMD is declared failed. */
+            ka_enable_pmd_health_check(core_id);
+        }
+        ka_set_pmd_state_ts(core_id, KA_STATE_DEAD, last_alive);
+    }
+
+    if (fail_state == KA_STATE_GONE) {
+        int pmd_hc_state = ka_get_pmd_health_check_state(core_id);
+
+        switch (pmd_hc_state) {
+        case PMD_HC_ENABLE:
+            break;
+        case PMD_HC_DISABLE:
+            VLOG_DBG_RL(&rl, "Health check disabled for PMD core:%d", core_id);
+            break;
+        case PMD_HC_PROGRESS:
+            ka_set_pmd_state_ts(core_id, KA_STATE_CHECK, last_alive);
+            break;
+
+        case PMD_HC_COMPLETE:
+            ka_shm->core_failures[core_id]++;
+            ka_set_pmd_state_ts(core_id, core_state, last_alive);
+            ka_disable_pmd_health_check(core_id);
+            break;
+
+        default:
+            VLOG_DBG_RL(&rl, "Unknown health check state %d", pmd_hc_state);
+            OVS_NOT_REACHED();
+        }
+    }
+}
+
 /* Update the core state in shared memory.
  *
  * This function shall be invoked periodically to write the core status and
@@ -631,10 +676,16 @@  dpdk_ka_update_core_state(void *ptr_data, const int core_id,
     case RTE_KA_STATE_MISSING:
         ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive);
         break;
-    case RTE_KA_STATE_DOZING:
-    case RTE_KA_STATE_SLEEP:
     case RTE_KA_STATE_DEAD:
+        dpdk_ka_handle_failure(KA_STATE_DEAD, core_id, core_state,
+                               last_alive, ka_shm);
+        break;
     case RTE_KA_STATE_GONE:
+        dpdk_ka_handle_failure(KA_STATE_GONE, core_id, core_state,
+                               last_alive, ka_shm);
+        break;
+    case RTE_KA_STATE_DOZING:
+    case RTE_KA_STATE_SLEEP:
         ka_set_pmd_state_ts(core_id, core_state, last_alive);
         break;
     case RTE_KA_STATE_UNUSED: