@@ -71,6 +71,9 @@ struct keepalive_shm {
/* Last seen timestamp of the core */
uint64_t core_last_seen_times[KEEPALIVE_MAXCORES];
+ /* Number of PMD failures */
+ uint32_t core_failures[KEEPALIVE_MAXCORES];
+
/* Store pmd thread tid */
pid_t thread_id[KEEPALIVE_MAXCORES];
@@ -606,6 +606,51 @@ dpdk_failcore_cb(void *ptr_data, const int core_id)
}
}
+/* Handles a PMD core that dpdk_ka_update_core_state() found in a failing
+ * keepalive state.
+ *
+ * 'fail_state' selects the handling:
+ *
+ *   - KA_STATE_DEAD: enables the PMD health check if the PMD thread is still
+ *     active, then records KA_STATE_DEAD for 'core_id'.
+ *
+ *   - KA_STATE_GONE: acts on the current PMD health check state; the failure
+ *     is counted in 'ka_shm->core_failures' only once the check is complete.
+ *
+ * Any other 'fail_state' value is ignored.
+ *
+ * 'core_state' is the state reported by DPDK; it is what gets recorded when
+ * the health check has completed.  'last_alive' is passed through unchanged
+ * to ka_set_pmd_state_ts(). */
+static void
+dpdk_ka_handle_failure(enum keepalive_state fail_state, const int core_id,
+ const enum rte_keepalive_state core_state,
+ uint64_t last_alive, struct keepalive_shm *ka_shm)
+{
+ if (fail_state == KA_STATE_DEAD) {
+ /* If process is in DEFUNCT/UNINTERRUPTIBLE/TRACED state it is inactive
+ * and no additional health checks are needed. */
+ /* NOTE(review): 'thread_id' in struct keepalive_shm is declared as pid_t;
+ * confirm that ka_get_tid() really returns a value that fits uint32_t. */
+ uint32_t tid = ka_get_tid(core_id);
+ if (process_is_active(tid)) {
+ /* Enable PMD health check only when PMD is in 'RUNNING' state and
+ * still doesn't respond to heartbeats. Health checks are needed to
+ * analyze other stats as we are in penultimate state of declaring
+ * PMD as failed. */
+ ka_enable_pmd_health_check(core_id);
+ }
+ /* The DEAD state is recorded whether or not the health check was
+ * enabled above. */
+ ka_set_pmd_state_ts(core_id, KA_STATE_DEAD, last_alive);
+ }
+
+ if (fail_state == KA_STATE_GONE) {
+ int pmd_hc_state = ka_get_pmd_health_check_state(core_id);
+
+ switch (pmd_hc_state) {
+ case PMD_HC_ENABLE:
+ /* No state is recorded here.  NOTE(review): presumably the check has
+ * been requested but has not started yet -- confirm. */
+ break;
+ case PMD_HC_DISABLE:
+ /* Only logged; the recorded core state is left untouched. */
+ VLOG_DBG_RL(&rl, "Health check disabled for PMD core:%d", core_id);
+ break;
+ case PMD_HC_PROGRESS:
+ /* Record the intermediate CHECK state instead of the state reported
+ * by DPDK. */
+ ka_set_pmd_state_ts(core_id, KA_STATE_CHECK, last_alive);
+ break;
+
+ case PMD_HC_COMPLETE:
+ /* Count the failure, record the state reported by DPDK and turn the
+ * health check off again.
+ * NOTE(review): 'core_id' indexes core_failures[KEEPALIVE_MAXCORES]
+ * without a range check here; confirm that callers validate it. */
+ ka_shm->core_failures[core_id]++;
+ ka_set_pmd_state_ts(core_id, core_state, last_alive);
+ ka_disable_pmd_health_check(core_id);
+ break;
+
+ default:
+ /* Any other health check state is treated as a programming error. */
+ VLOG_DBG_RL(&rl, "Unknown health check state %d", pmd_hc_state);
+ OVS_NOT_REACHED();
+ }
+ }
+}
+
/* Update the core state in shared memory.
*
* This function shall be invoked periodically to write the core status and
@@ -631,10 +676,16 @@ dpdk_ka_update_core_state(void *ptr_data, const int core_id,
case RTE_KA_STATE_MISSING:
ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive);
break;
- case RTE_KA_STATE_DOZING:
- case RTE_KA_STATE_SLEEP:
case RTE_KA_STATE_DEAD:
+ dpdk_ka_handle_failure(KA_STATE_DEAD, core_id, core_state,
+ last_alive, ka_shm);
+ break;
case RTE_KA_STATE_GONE:
+ dpdk_ka_handle_failure(KA_STATE_GONE, core_id, core_state,
+ last_alive, ka_shm);
+ break;
+ case RTE_KA_STATE_DOZING:
+ case RTE_KA_STATE_SLEEP:
ka_set_pmd_state_ts(core_id, core_state, last_alive);
break;
case RTE_KA_STATE_UNUSED:
The keepalive thread sends heartbeats to the PMD thread, and when the PMD fails to respond to successive heartbeats it is potentially stalled. The PMD state transition is as below: ALIVE -> MISSING -> DEAD -> GONE. This commit enables PMD health checks when the PMD doesn't respond to heartbeats. This is needed to handle false negatives. With this commit the new state transition is as below: ALIVE -> MISSING -> DEAD -> CHECK -> GONE. A PMD health checking state is introduced and kicks in immediately when the PMD gets into the DEAD state. As part of the health check, the following are considered: - Link status of the ports polled by the PMD thread. - Statistics of the ports polled by the PMD thread. - PMD polling and processing cycles. Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> --- lib/keepalive.h | 3 +++ lib/netdev-dpdk.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-)