diff mbox

[ovs-dev,v4,2/7] Keepalive: Add initial keepalive support.

Message ID 1503394869-76348-3-git-send-email-bhanuprakash.bodireddy@intel.com
State Superseded
Headers show

Commit Message

Bodireddy, Bhanuprakash Aug. 22, 2017, 9:41 a.m. UTC
This commit introduces the initial keepalive support by adding
'keepalive' module and also helper and initialization functions
that will be invoked by later commits.

This commit adds new ovsdb column "keepalive" that shows the status
of the datapath threads. This is implemented for DPDK datapath and
only status of PMD threads is reported.

For eg:
  To enable keepalive feature.
  'ovs-vsctl --no-wait set Open_vSwitch . other_config:enable-keepalive=true'

  To set timer interval of 5000ms for monitoring packet processing cores.
  'ovs-vsctl --no-wait set Open_vSwitch . \
     other_config:keepalive-interval="5000"

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
---
 lib/automake.mk            |   2 +
 lib/keepalive.c            | 183 +++++++++++++++++++++++++++++++++++++++++++++
 lib/keepalive.h            |  87 +++++++++++++++++++++
 vswitchd/bridge.c          |   3 +
 vswitchd/vswitch.ovsschema |   8 +-
 vswitchd/vswitch.xml       |  49 ++++++++++++
 6 files changed, 330 insertions(+), 2 deletions(-)
 create mode 100644 lib/keepalive.c
 create mode 100644 lib/keepalive.h

Comments

Fischetti, Antonio Sept. 5, 2017, 7:03 a.m. UTC | #1
Comments inline.

> -----Original Message-----
> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-bounces@openvswitch.org]
> On Behalf Of Bhanuprakash Bodireddy
> Sent: Tuesday, August 22, 2017 10:41 AM
> To: dev@openvswitch.org
> Cc: i.maximets@samsung.com
> Subject: [ovs-dev] [PATCH v4 2/7] Keepalive: Add initial keepalive support.
> 
> This commit introduces the initial keepalive support by adding
> 'keepalive' module and also helper and initialization functions
> that will be invoked by later commits.
> 
> This commit adds new ovsdb column "keepalive" that shows the status
> of the datapath threads. This is implemented for DPDK datapath and
> only status of PMD threads is reported.
> 
> For eg:
>   To enable keepalive feature.
>   'ovs-vsctl --no-wait set Open_vSwitch . other_config:enable-keepalive=true'

[Antonio]
It would help saying that this command must be run 'before'
launching ovs-vswitchd otherwise has no effect.

> 
>   To set timer interval of 5000ms for monitoring packet processing cores.
>   'ovs-vsctl --no-wait set Open_vSwitch . \
>      other_config:keepalive-interval="5000"
> 
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
> ---
>  lib/automake.mk            |   2 +
>  lib/keepalive.c            | 183 +++++++++++++++++++++++++++++++++++++++++++++
>  lib/keepalive.h            |  87 +++++++++++++++++++++
>  vswitchd/bridge.c          |   3 +
>  vswitchd/vswitch.ovsschema |   8 +-
>  vswitchd/vswitch.xml       |  49 ++++++++++++
>  6 files changed, 330 insertions(+), 2 deletions(-)
>  create mode 100644 lib/keepalive.c
>  create mode 100644 lib/keepalive.h
> 
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 2415f4c..0d99f0a 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -110,6 +110,8 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/json.c \
>  	lib/jsonrpc.c \
>  	lib/jsonrpc.h \
> +	lib/keepalive.c \
> +	lib/keepalive.h \
>  	lib/lacp.c \
>  	lib/lacp.h \
>  	lib/latch.h \
> diff --git a/lib/keepalive.c b/lib/keepalive.c
> new file mode 100644
> index 0000000..ac73a42
> --- /dev/null
> +++ b/lib/keepalive.c
> @@ -0,0 +1,183 @@
> +/*
> + * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include <errno.h>
> +#include <fcntl.h>
> +#include <stdbool.h>
> +#include <unistd.h>
> +
> +#include "keepalive.h"
> +#include "lib/vswitch-idl.h"
> +#include "openvswitch/vlog.h"
> +#include "timeval.h"
> +
> +VLOG_DEFINE_THIS_MODULE(keepalive);
> +
> +static bool keepalive_enable = false;    /* Keepalive disabled by default */
> +static bool ka_init_status = ka_init_failure; /* Keepalive initialization */
> +static uint32_t keepalive_timer_interval;     /* keepalive timer interval */
> +static struct keepalive_info *ka_info = NULL;
> +
> +inline bool
> +ka_is_enabled(void)
> +{
> +    return keepalive_enable;
> +}
> +
> +inline int
> +ka_get_pmd_tid(unsigned core_idx)
> +{
> +    if (ka_is_enabled()) {
> +        return ka_info->thread_id[core_idx];
> +    }
> +
> +    return -EINVAL;
> +}
> +
> +void
> +ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
> +                    uint64_t last_alive)
> +{
> +    struct ka_process_info *pinfo;
> +    int tid = ka_get_pmd_tid(core_id);
> +
> +    ovs_mutex_lock(&ka_info->proclist_mutex);
> +    HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0),
> +                             &ka_info->process_list) {
> +        if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) {
> +            pinfo->core_state = state;
> +            pinfo->core_last_seen_times = last_alive;
> +        }
> +    }
> +    ovs_mutex_unlock(&ka_info->proclist_mutex);
> +}
> +
> +/* Retrieve and return the keepalive timer interval from OVSDB. */
> +static uint32_t
> +ka_get_timer_interval(const struct smap *ovs_other_config OVS_UNUSED)
> +{
> +#define OVS_KEEPALIVE_TIMEOUT 1000    /* Default timeout set to 1000ms */
> +    uint32_t ka_interval;
> +
> +    /* Timer granularity in milliseconds
> +     * Defaults to OVS_KEEPALIVE_TIMEOUT(ms) if not set */
> +    ka_interval = smap_get_int(ovs_other_config, "keepalive-interval",
> +                  OVS_KEEPALIVE_TIMEOUT);
> +
> +    VLOG_INFO("Keepalive timer interval set to %"PRIu32" (ms)\n",
> ka_interval);
> +    return ka_interval;
> +}
> +
> +/*
> + * This function shall be invoked periodically to write the core status and
> + * last seen timestamp of the cores in to keepalive info structure.
> + */
> +void
> +ka_update_core_state(const int core_id, const enum keepalive_state core_state,
> +                     uint64_t last_alive, void *ptr_data OVS_UNUSED)
> +{
> +    switch (core_state) {
> +    case KA_STATE_ALIVE:
> +    case KA_STATE_MISSING:
> +        ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive);
> +        break;
> +    case KA_STATE_UNUSED:
> +    case KA_STATE_DOZING:
> +    case KA_STATE_SLEEP:
> +    case KA_STATE_DEAD:
> +    case KA_STATE_GONE:
> +        ka_set_pmd_state_ts(core_id, core_state, last_alive);
> +        break;

[Antonio]
Just for robustness - should core_state be corrupted for some reason -
I'd add the default switch case here, 
something like?
+    default:
+        VLOG_DBG("Unexpected %d value for core_state.", core_state);

or alternatively defining a __KA_STATE_MAX and using OVS_NOT_REACHED()?

> +    }
> +}
> +
> +static void
> +keepalive_register_relay_cb(ka_relay_cb cb, void *aux)
> +{
> +    ka_info->relay_cb = cb;
> +    ka_info->relay_cb_data = aux;
> +}
> +
> +static struct keepalive_info *
> +keepalive_info_create(void)
> +{
> +    struct keepalive_info *ka_info;
> +    ka_info = xzalloc(sizeof *ka_info);
> +    if (ka_info != NULL) {
> +        ka_info->init_time = time_wall_msec();
> +    }
> +
> +    return ka_info;
> +}
> +
> +static int
> +ka_init__(void)
> +{
> +    if ((ka_info = keepalive_info_create()) == NULL) {
> +        VLOG_ERR("OvS Keepalive - initialization failed.");
> +        return -1;
> +    }
> +
> +    keepalive_register_relay_cb(ka_update_core_state, NULL);
> +    ovs_mutex_init(&ka_info->proclist_mutex);
> +    hmap_init(&ka_info->process_list);
> +
> +    return 0;
> +}
> +
> +void
> +ka_init(const struct smap *ovs_other_config)
> +{
> +    if (smap_get_bool(ovs_other_config, "enable-keepalive", false)) {
> +        static struct ovsthread_once once_enable = OVSTHREAD_ONCE_INITIALIZER;
> +
> +        if (ovsthread_once_start(&once_enable)) {
> +            keepalive_enable = true;
> +            VLOG_INFO("OvS Keepalive enabled.");
> +
> +            keepalive_timer_interval =
> +                ka_get_timer_interval(ovs_other_config);
> +
> +            int err = ka_init__();
> +            if (!err) {
> +                VLOG_INFO("OvS Keepalive - initialized.");
> +                ka_init_status = ka_init_success;
> +            }
> +
> +            ovsthread_once_done(&once_enable);
> +        }
> +    }
> +}
> +
> +void
> +ka_destroy(void)
> +{
> +    if (ka_info) {
> +        struct ka_process_info *pinfo;
> +
> +        ovs_mutex_lock(&ka_info->proclist_mutex);
> +        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
> +            free(pinfo);
> +        }
> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
> +
> +        hmap_destroy(&ka_info->process_list);
> +        ovs_mutex_destroy(&ka_info->proclist_mutex);
> +
> +        free(ka_info);
> +    }
> +}
> diff --git a/lib/keepalive.h b/lib/keepalive.h
> new file mode 100644
> index 0000000..d133398
> --- /dev/null
> +++ b/lib/keepalive.h
> @@ -0,0 +1,87 @@
> +/*
> + * Copyright (c) 2016 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef KEEPALIVE_H
> +#define KEEPALIVE_H
> +
> +#include <stdint.h>
> +#include "openvswitch/hmap.h"
> +#include "ovs-thread.h"
> +
> +#define KA_DP_MAXCORES 128
> +
> +struct smap;
> +
> +enum keepalive_state {
> +    KA_STATE_UNUSED = 0,
> +    KA_STATE_ALIVE = 1,
> +    KA_STATE_MISSING = 4,
> +    KA_STATE_DEAD = 2,
> +    KA_STATE_GONE = 3,
> +    KA_STATE_DOZING = 5,
> +    KA_STATE_SLEEP = 6,
> +};
> +
> +typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
> +
> +struct ka_process_info {
> +    int tid;
> +    int core_id;
> +    enum keepalive_state core_state;
> +    uint64_t core_last_seen_times;
> +    struct hmap_node node;
> +};
> +
> +struct keepalive_info {
> +    /* Mutex for 'process_list'. */
> +    struct ovs_mutex proclist_mutex;
> +
> +    /* List of process/threads monitored by KA framework. */
> +    struct hmap process_list OVS_GUARDED;
> +
> +    /* Store Datapath threads 'tid'.
> +     * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
> +    pid_t thread_id[KA_DP_MAXCORES];
> +
> +    /* Datapath cores state. */
> +    enum keepalive_state state_flags[KA_DP_MAXCORES];
> +
> +    uint64_t init_time;
> +    /* Last seen timestamps. */
> +    uint64_t last_alive[KA_DP_MAXCORES];
> +
> +    /* Active cores (non-zero if the core should be monitored). */
> +    uint8_t active_cores[KA_DP_MAXCORES];
> +
> +    /** Relay handler. */
> +    ka_relay_cb relay_cb;
> +    void *relay_cb_data;
> +};
> +
> +enum keepalive_status {
> +    ka_init_failure = 0,
> +    ka_init_success
> +};
> +
> +void ka_init(const struct smap *);
> +void ka_destroy(void);
> +void ka_set_pmd_state_ts(unsigned, enum keepalive_state, uint64_t);
> +void ka_update_core_state(const int, const enum keepalive_state,
> +                          uint64_t, void *);
> +
> +int ka_get_pmd_tid(unsigned core);
> +bool ka_is_enabled(void);
> +#endif /* keepalive.h */
> diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
> index a8cbae7..aaa8d9b 100644
> --- a/vswitchd/bridge.c
> +++ b/vswitchd/bridge.c
> @@ -34,6 +34,7 @@
>  #include "hmapx.h"
>  #include "if-notifier.h"
>  #include "jsonrpc.h"
> +#include "keepalive.h"
>  #include "lacp.h"
>  #include "mac-learning.h"
>  #include "mcast-snooping.h"
> @@ -506,6 +507,7 @@ bridge_exit(bool delete_datapath)
>          bridge_destroy(br, delete_datapath);
>      }
>      ovsdb_idl_destroy(idl);
> +    ka_destroy();
>  }
> 
>  /* Looks at the list of managers in 'ovs_cfg' and extracts their remote IP
> @@ -2953,6 +2955,7 @@ bridge_run(void)
>      if (cfg) {
>          netdev_set_flow_api_enabled(&cfg->other_config);
>          dpdk_init(&cfg->other_config);
> +        ka_init(&cfg->other_config);
>      }
> 
>      /* Initialize the ofproto library.  This only needs to run once, but
> diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema
> index 90e50b6..c56a64c 100644
> --- a/vswitchd/vswitch.ovsschema
> +++ b/vswitchd/vswitch.ovsschema
> @@ -1,6 +1,6 @@
>  {"name": "Open_vSwitch",
> - "version": "7.15.1",
> - "cksum": "3682332033 23608",
> + "version": "7.16.0",
> + "cksum": "3631938350 23762",
>   "tables": {
>     "Open_vSwitch": {
>       "columns": {
> @@ -30,6 +30,10 @@
>           "type": {"key": "string", "value": "string",
>                    "min": 0, "max": "unlimited"},
>           "ephemeral": true},
> +       "keepalive": {
> +         "type": {"key": "string", "value": "string", "min": 0,
> +                  "max": "unlimited"},
> +         "ephemeral": true},
>         "ovs_version": {
>           "type": {"key": {"type": "string"},
>                    "min": 0, "max": 1}},
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 074535b..4302fd5 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -568,6 +568,55 @@
>            </p>
>          </column>
>        </group>
> +
> +      <group title="Keepalive">
> +        <p>
> +          The <code>keepalive</code> column contains key-value pairs that
> +          report health of datapath threads in Open vSwitch.  These are
> updated
> +          periodically (based on the keepalive-interval).
> +        </p>
> +
> +        <column name="other_config" key="enable-keepalive"
> +                type='{"type": "boolean"}'>
> +          Keepalive is disabled by default to avoid overhead in the common
> +          case when heartbeat monitoring is not useful.  Set this value to
> +          <code>true</code> to enable keepalive <ref column="keepalive"/>
> +          column or to <code>false</code> to explicitly disable it.
> +        </column>
> +
> +        <column name="other_config" key="keepalive-interval"
> +                type='{"type": "integer", "minInteger": 1}'>
> +          <p>
> +            Specifies the keepalive interval value.
> +          </p>
> +          <p>
> +            If not specified, this will be set to 1000 milliseconds (default
> +            value). Changing this value requires restarting the daemon.
> +          </p>
> +        </column>
> +
> +        <column name="keepalive" key="PMD_ID">
> +          <p>
> +            One such key-value pair, with <code>ID</code> replaced by the
> +            PMD thread, will exist for each active PMD thread.  The value is a
> +            comma-separated list of PMD thread status, core number and the
> +            last seen timestamp of PMD thread. In respective order, these

[Antonio]
Nit: one missing space 
               last seen timestamp of PMD thread.  In respective order, these

> +            values are:
> +          </p>
> +
> +          <ol>
> +            <li>Status of PMD thread.  Valid values include ALIVE, MISSING,
> +            DEAD, GONE, DOZING, SLEEPING.</li>

[Antonio]
Nit: should SLEEPING be changed into SLEEP as it is referred all over the code?
This is not so much for esthetics as for a cautious approach with the
state machine names and any future changes.

> +            <li>Core id of PMD thread.</li>
> +            <li>Last seen timestamp(epoch) of the PMD core.</li>
> +          </ol>
> +
> +          <p>
> +            This is only valid for OvS-DPDK Datapath and PMD threads status
> +            is implemented currently.
> +          </p>
> +        </column>
> +      </group>
>      </group>
> 
>      <group title="Version Reporting">
> --
> 2.4.11
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Aaron Conole Sept. 6, 2017, 8:33 p.m. UTC | #2
Hi Bhanu,

Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> writes:

> This commit introduces the initial keepalive support by adding
> 'keepalive' module and also helper and initialization functions
> that will be invoked by later commits.
>
> This commit adds new ovsdb column "keepalive" that shows the status
> of the datapath threads. This is implemented for DPDK datapath and
> only status of PMD threads is reported.

I don't see the value in having this enabled / disabled flag?  Why not just
always have it on?

Additionally, even setting these true in this commit won't do anything.
No tracking starts until 3/7, afaict.

I guess it's okay to document here, but it might be worth stating that.

> For eg:
>   To enable keepalive feature.
>   'ovs-vsctl --no-wait set Open_vSwitch . other_config:enable-keepalive=true'

I'm not sure that a separate enable / disable flag is needed.

>   To set timer interval of 5000ms for monitoring packet processing cores.
>   'ovs-vsctl --no-wait set Open_vSwitch . \
>      other_config:keepalive-interval="5000"
>
> Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com>
> ---

As stated earlier, please add a Documentation/ update with this.

>  lib/automake.mk            |   2 +
>  lib/keepalive.c            | 183 +++++++++++++++++++++++++++++++++++++++++++++
>  lib/keepalive.h            |  87 +++++++++++++++++++++
>  vswitchd/bridge.c          |   3 +
>  vswitchd/vswitch.ovsschema |   8 +-
>  vswitchd/vswitch.xml       |  49 ++++++++++++
>  6 files changed, 330 insertions(+), 2 deletions(-)
>  create mode 100644 lib/keepalive.c
>  create mode 100644 lib/keepalive.h
>
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 2415f4c..0d99f0a 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -110,6 +110,8 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/json.c \
>  	lib/jsonrpc.c \
>  	lib/jsonrpc.h \
> +	lib/keepalive.c \
> +	lib/keepalive.h \
>  	lib/lacp.c \
>  	lib/lacp.h \
>  	lib/latch.h \
> diff --git a/lib/keepalive.c b/lib/keepalive.c
> new file mode 100644
> index 0000000..ac73a42
> --- /dev/null
> +++ b/lib/keepalive.c
> @@ -0,0 +1,183 @@
> +/*
> + * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.

This line is not appropriately attributing the file.

> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include <errno.h>
> +#include <fcntl.h>
> +#include <stdbool.h>
> +#include <unistd.h>
> +
> +#include "keepalive.h"
> +#include "lib/vswitch-idl.h"
> +#include "openvswitch/vlog.h"
> +#include "timeval.h"
> +
> +VLOG_DEFINE_THIS_MODULE(keepalive);
> +
> +static bool keepalive_enable = false;    /* Keepalive disabled by default */
> +static bool ka_init_status = ka_init_failure; /* Keepalive initialization */

You're assigning this bool a value from an enum.  I know that's probably
allowed, but it looks strange to me.  I would prefer that this type
either reflect the enum type or a true/false value is used instead.

> +static uint32_t keepalive_timer_interval;     /* keepalive timer interval */
> +static struct keepalive_info *ka_info = NULL;

Why allocate ka_info?  It will simplify some of the later code to just
keep it statically available.  It also means you can eliminate the
xzalloc() and free() calls you use later on in code.

Also, the nice thing about a static declaration is the structure will
already be 0 filled, and you'll know at program initialization time
whether it will succeed in getting the allocation.

> +
> +inline bool

The inline keyword is inappropriate in .c files.  Please let the
compiler do it's job.

> +ka_is_enabled(void)
> +{
> +    return keepalive_enable;
> +}
> +

I'm not sure about enable / disable.  In this case, I think the branches
are not needed at all.  Better to have this always enabled and just deal
with whatever fallout may come.

> +inline int

See previous comment about inline and c files.

> +ka_get_pmd_tid(unsigned core_idx)
> +{
> +    if (ka_is_enabled()) {
> +        return ka_info->thread_id[core_idx];
> +    }
> +
> +    return -EINVAL;

I would expect EINVAL for core_idx which addresses outside of the
thread_id range.

Actually, I'm wondering why have a "core-id" based lookup at all?  Keep
a map of tid which are being monitored.  tid will never change; core_idx
can be manipulated by outside forces (taskset comes to mind).

I'm not sure there's a useful way of correlating core / task usage.  If
the task can be moved by the scheduler then the only reliable way to
track this information is via the operating system (since the task
itself won't know where it runs if it's cpuset contains a mask of core
IDs).

I think having task based usage reported could be useful.  Core-based
usage is less so (since other processes may be scheduled on the core,
outside of OvS's control - and even the system operator may change the
core assignments).

Given that, I'd drop all core_id references.  Just use the task id, and
track that everywhere.  As an added benefit, I think it makes some of
the code simpler.

> +}
> +
> +void
> +ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
> +                    uint64_t last_alive)
> +{
> +    struct ka_process_info *pinfo;
> +    int tid = ka_get_pmd_tid(core_id);
> +
> +    ovs_mutex_lock(&ka_info->proclist_mutex);

If this ever gets called from a PMD context, this is a performance killer.

> +    HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0),
> +                             &ka_info->process_list) {
> +        if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) {

Do we really need to check against both?

> +            pinfo->core_state = state;
> +            pinfo->core_last_seen_times = last_alive;
> +        }
> +    }
> +    ovs_mutex_unlock(&ka_info->proclist_mutex);
> +}
> +
> +/* Retrieve and return the keepalive timer interval from OVSDB. */
> +static uint32_t
> +ka_get_timer_interval(const struct smap *ovs_other_config OVS_UNUSED)

The OVS_UNUSED here is not an appropriate annotation.

> +{
> +#define OVS_KEEPALIVE_TIMEOUT 1000    /* Default timeout set to 1000ms */

Move this definition to the header file, and put the word DEFAULT in it
somewhere.

> +    uint32_t ka_interval;
> +
> +    /* Timer granularity in milliseconds
> +     * Defaults to OVS_KEEPALIVE_TIMEOUT(ms) if not set */
> +    ka_interval = smap_get_int(ovs_other_config, "keepalive-interval",
> +                  OVS_KEEPALIVE_TIMEOUT);
> +
> +    VLOG_INFO("Keepalive timer interval set to %"PRIu32" (ms)\n", ka_interval);
> +    return ka_interval;
> +}
> +
> +/*
> + * This function shall be invoked periodically to write the core status and
> + * last seen timestamp of the cores in to keepalive info structure.
> + */
> +void
> +ka_update_core_state(const int core_id, const enum keepalive_state core_state,
> +                     uint64_t last_alive, void *ptr_data OVS_UNUSED)

Don't have an OVS_UNUSED parameter here.  It doesn't make sense.  Add it
when it is time.

> +{
> +    switch (core_state) {
> +    case KA_STATE_ALIVE:
> +    case KA_STATE_MISSING:
> +        ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive);
> +        break;
> +    case KA_STATE_UNUSED:
> +    case KA_STATE_DOZING:
> +    case KA_STATE_SLEEP:
> +    case KA_STATE_DEAD:
> +    case KA_STATE_GONE:
> +        ka_set_pmd_state_ts(core_id, core_state, last_alive);
> +        break;
> +    }
> +}
> +
> +static void
> +keepalive_register_relay_cb(ka_relay_cb cb, void *aux)
> +{
> +    ka_info->relay_cb = cb;
> +    ka_info->relay_cb_data = aux;
> +}
> +
> +static struct keepalive_info *
> +keepalive_info_create(void)
> +{
> +    struct keepalive_info *ka_info;
> +    ka_info = xzalloc(sizeof *ka_info);
> +    if (ka_info != NULL) {

This condition tests for a case that can never occur.  I also think
there should be no need for a dynamic allocation here.

> +        ka_info->init_time = time_wall_msec();
> +    }
> +
> +    return ka_info;
> +}
> +
> +static int
> +ka_init__(void)
> +{
> +    if ((ka_info = keepalive_info_create()) == NULL) {

Likewise.

> +        VLOG_ERR("OvS Keepalive - initialization failed.");
> +        return -1;
> +    }
> +
> +    keepalive_register_relay_cb(ka_update_core_state, NULL);
> +    ovs_mutex_init(&ka_info->proclist_mutex);
> +    hmap_init(&ka_info->process_list);
> +
> +    return 0;
> +}
> +
> +void
> +ka_init(const struct smap *ovs_other_config)
> +{
> +    if (smap_get_bool(ovs_other_config, "enable-keepalive", false)) {

I think because of the way this is written, it is possible to change the
keepalive from 'false' to 'true' (but not the other way) at runtime.

Regardless, I think this toggle shouldn't exist.

> +        static struct ovsthread_once once_enable = OVSTHREAD_ONCE_INITIALIZER;
> +
> +        if (ovsthread_once_start(&once_enable)) {
> +            keepalive_enable = true;
> +            VLOG_INFO("OvS Keepalive enabled.");
> +
> +            keepalive_timer_interval =
> +                ka_get_timer_interval(ovs_other_config);
> +
> +            int err = ka_init__();
> +            if (!err) {
> +                VLOG_INFO("OvS Keepalive - initialized.");
> +                ka_init_status = ka_init_success;
> +            }
> +
> +            ovsthread_once_done(&once_enable);
> +        }
> +    }
> +}
> +
> +void
> +ka_destroy(void)
> +{
> +    if (ka_info) {
> +        struct ka_process_info *pinfo;
> +
> +        ovs_mutex_lock(&ka_info->proclist_mutex);
> +        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
> +            free(pinfo);
> +        }
> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
> +
> +        hmap_destroy(&ka_info->process_list);
> +        ovs_mutex_destroy(&ka_info->proclist_mutex);
> +
> +        free(ka_info);

if the dynamic allocation is removed, much of this can be corrected.

> +    }
> +}
> diff --git a/lib/keepalive.h b/lib/keepalive.h
> new file mode 100644
> index 0000000..d133398
> --- /dev/null
> +++ b/lib/keepalive.h
> @@ -0,0 +1,87 @@
> +/*
> + * Copyright (c) 2016 Nicira, Inc.

This line is not appropriately attributing the file.

> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef KEEPALIVE_H
> +#define KEEPALIVE_H
> +
> +#include <stdint.h>
> +#include "openvswitch/hmap.h"
> +#include "ovs-thread.h"
> +
> +#define KA_DP_MAXCORES 128

This doesn't make sense to me.  Track by task/thread ID.  Don't use a
fixed sized array - use a map and index by the task id.  When you need
to iterate through all, MAP_FOR_EACH.

> +
> +struct smap;
> +
> +enum keepalive_state {
> +    KA_STATE_UNUSED = 0,
> +    KA_STATE_ALIVE = 1,
> +    KA_STATE_MISSING = 4,
> +    KA_STATE_DEAD = 2,
> +    KA_STATE_GONE = 3,
> +    KA_STATE_DOZING = 5,
> +    KA_STATE_SLEEP = 6,
> +};

Again, please describe these states.

It looks strange to me to declare an enum this way.  The compiler will
do all of this for you.

> +
> +typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
> +
> +struct ka_process_info {
> +    int tid;
> +    int core_id;

Since threads / processes can be bound to multiple cpus, use cpu_set_t
rather than a simple int.

> +    enum keepalive_state core_state;
> +    uint64_t core_last_seen_times;
> +    struct hmap_node node;
> +};
> +
> +struct keepalive_info {

Does this need to be exposed in this header?  I think not.

> +    /* Mutex for 'process_list'. */
> +    struct ovs_mutex proclist_mutex;
> +
> +    /* List of process/threads monitored by KA framework. */
> +    struct hmap process_list OVS_GUARDED;

Maybe a cmap instead.

> +
> +    /* Store Datapath threads 'tid'.
> +     * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
> +    pid_t thread_id[KA_DP_MAXCORES];
> +
> +    /* Datapath cores state. */
> +    enum keepalive_state state_flags[KA_DP_MAXCORES];
> +
> +    uint64_t init_time;
> +    /* Last seen timestamps. */
> +    uint64_t last_alive[KA_DP_MAXCORES];
> +
> +    /* Active cores (non-zero if the core should be monitored). */
> +    uint8_t active_cores[KA_DP_MAXCORES];

I guess this is a hold over from the older 'track cores' version.  The
KA framework will monitor tasks, not cores, yes?  In that case, let's
drop all the core_id, core_idx, etc.

> +
> +    /** Relay handler. */
> +    ka_relay_cb relay_cb;
> +    void *relay_cb_data;
> +};
> +
> +enum keepalive_status {
> +    ka_init_failure = 0,
> +    ka_init_success
> +};
> +
> +void ka_init(const struct smap *);
> +void ka_destroy(void);
> +void ka_set_pmd_state_ts(unsigned, enum keepalive_state, uint64_t);
> +void ka_update_core_state(const int, const enum keepalive_state,
> +                          uint64_t, void *);
> +
> +int ka_get_pmd_tid(unsigned core);

I'm assuming all of the non-task id stuff will get removed (since we
should only report on task/thread ID - that's all we can reliably know).

> +bool ka_is_enabled(void);

Again, let's remove the enabled/disabled flag if we're going to have
this feature.

> +#endif /* keepalive.h */
> diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
> index a8cbae7..aaa8d9b 100644
> --- a/vswitchd/bridge.c
> +++ b/vswitchd/bridge.c
> @@ -34,6 +34,7 @@
>  #include "hmapx.h"
>  #include "if-notifier.h"
>  #include "jsonrpc.h"
> +#include "keepalive.h"
>  #include "lacp.h"
>  #include "mac-learning.h"
>  #include "mcast-snooping.h"
> @@ -506,6 +507,7 @@ bridge_exit(bool delete_datapath)
>          bridge_destroy(br, delete_datapath);
>      }
>      ovsdb_idl_destroy(idl);
> +    ka_destroy();
>  }
>  
>  /* Looks at the list of managers in 'ovs_cfg' and extracts their remote IP
> @@ -2953,6 +2955,7 @@ bridge_run(void)
>      if (cfg) {
>          netdev_set_flow_api_enabled(&cfg->other_config);
>          dpdk_init(&cfg->other_config);
> +        ka_init(&cfg->other_config);
>      }
>  
>      /* Initialize the ofproto library.  This only needs to run once, but
> diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema
> index 90e50b6..c56a64c 100644
> --- a/vswitchd/vswitch.ovsschema
> +++ b/vswitchd/vswitch.ovsschema
> @@ -1,6 +1,6 @@
>  {"name": "Open_vSwitch",
> - "version": "7.15.1",
> - "cksum": "3682332033 23608",
> + "version": "7.16.0",
> + "cksum": "3631938350 23762",
>   "tables": {
>     "Open_vSwitch": {
>       "columns": {
> @@ -30,6 +30,10 @@
>           "type": {"key": "string", "value": "string",
>                    "min": 0, "max": "unlimited"},
>           "ephemeral": true},
> +       "keepalive": {
> +         "type": {"key": "string", "value": "string", "min": 0,
> +                  "max": "unlimited"},
> +         "ephemeral": true},
>         "ovs_version": {
>           "type": {"key": {"type": "string"},
>                    "min": 0, "max": 1}},
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 074535b..4302fd5 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -568,6 +568,55 @@
>            </p>
>          </column>
>        </group>
> +
> +      <group title="Keepalive">
> +        <p>
> +          The <code>keepalive</code> column contains key-value pairs that
> +          report health of datapath threads in Open vSwitch.  These are updated
> +          periodically (based on the keepalive-interval).
> +        </p>
> +
> +        <column name="other_config" key="enable-keepalive"
> +                type='{"type": "boolean"}'>
> +          Keepalive is disabled by default to avoid overhead in the common
> +          case when heartbeat monitoring is not useful.  Set this value to
> +          <code>true</code> to enable keepalive <ref column="keepalive"/>
> +          column or to <code>false</code> to explicitly disable it.
> +        </column>
> +
> +        <column name="other_config" key="keepalive-interval"
> +                type='{"type": "integer", "minInteger": 1}'>
> +          <p>
> +            Specifies the keepalive interval value.

Add 'in ms.' or 'in milliseconds.'

I think a minimum value of 1 is far too aggressive.  Probably this
should be in 10 ms increments, and a minimum value of 1 would make a
little bit more sense.  In that case, a default value of 100 would also
make sense.

> +          </p>
> +          <p>
> +            If not specified, this will be set to 1000 milliseconds (default
> +            value). Changing this value requires restarting the daemon.
> +          </p>
> +        </column>
> +
> +        <column name="keepalive" key="PMD_ID">
> +          <p>
> +            One such key-value pair, with <code>ID</code> replaced by the
> +            PMD thread, will exist for each active PMD thread.  The value is a
> +            comma-separated list of PMD thread status, core number and the
> +            last seen timestamp of PMD thread. In respective order, these
> +            values are:
> +          </p>
> +
> +          <ol>
> +            <li>Status of PMD thread.  Valid values include ALIVE, MISSING,
> +            DEAD, GONE, DOZING, SLEEPING.</li>

It would be good in the documentation to tell what these states mean,
and what would be expected.  For instance - dead vs. gone?  What about
dozing vs. sleeping.  These states aren't apparent and don't have any
kind of match with any kind of process states (think START, READY,
RUNNING, WAITING, TERMINATED).

> +            <li>Core id of PMD thread.</li>

I think you mean task id (or name?), here.

> +            <li>Last seen timestamp(epoch) of the PMD core.</li>
> +          </ol>
> +
> +          <p>
> +            This is only valid for OvS-DPDK Datapath and PMD threads status
> +            is implemented currently.
> +          </p>
> +        </column>
> +      </group>
>      </group>
>  
>      <group title="Version Reporting">
Bodireddy, Bhanuprakash Sept. 7, 2017, 3:35 p.m. UTC | #3
Hi Aaron,

My reply inline.

>Hi Bhanu,
>
>Bhanuprakash Bodireddy <bhanuprakash.bodireddy@intel.com> writes:
>
>> This commit introduces the initial keepalive support by adding
>> 'keepalive' module and also helper and initialization functions that
>> will be invoked by later commits.
>>
>> This commit adds new ovsdb column "keepalive" that shows the status of
>> the datapath threads. This is implemented for DPDK datapath and only
>> status of PMD threads is reported.
>
>I don't see the value in having this enabled / disabled flag?  Why not just
>always have it on?

[BHANU]  

I was following the conventions here.  
OvS statistics is done similar way where the stats can be enabled with 'other_config:enable-statistics=true' with default being false.
Maybe this is done as additional thread (system_stats) will be spawned to handle the functionality and users should have an option to turn them on/off. 

>
>Additionally, even setting these true in this commit won't do anything.
>No tracking starts until 3/7, afaict.
>
>I guess it's okay to document here, but it might be worth stating that.

[BHANU]  Ok. 

>
>> For eg:
>>   To enable keepalive feature.
>>   'ovs-vsctl --no-wait set Open_vSwitch . other_config:enable-
>keepalive=true'
>
>I'm not sure that a separate enable / disable flag is needed.
>
>>   To set timer interval of 5000ms for monitoring packet processing cores.
>>   'ovs-vsctl --no-wait set Open_vSwitch . \
>>      other_config:keepalive-interval="5000"
>>
>> Signed-off-by: Bhanuprakash Bodireddy
>> <bhanuprakash.bodireddy@intel.com>
>> ---
>
>As stated earlier, please add a Documentation/ update with this.

[BHANU]  I would add the documentation in the respin.  

>
>>  lib/automake.mk            |   2 +
>>  lib/keepalive.c            | 183
>+++++++++++++++++++++++++++++++++++++++++++++
>>  lib/keepalive.h            |  87 +++++++++++++++++++++
>>  vswitchd/bridge.c          |   3 +
>>  vswitchd/vswitch.ovsschema |   8 +-
>>  vswitchd/vswitch.xml       |  49 ++++++++++++
>>  6 files changed, 330 insertions(+), 2 deletions(-)  create mode
>> 100644 lib/keepalive.c  create mode 100644 lib/keepalive.h
>>
>> diff --git a/lib/automake.mk b/lib/automake.mk index 2415f4c..0d99f0a
>> 100644
>> --- a/lib/automake.mk
>> +++ b/lib/automake.mk
>> @@ -110,6 +110,8 @@ lib_libopenvswitch_la_SOURCES = \
>>  	lib/json.c \
>>  	lib/jsonrpc.c \
>>  	lib/jsonrpc.h \
>> +	lib/keepalive.c \
>> +	lib/keepalive.h \
>>  	lib/lacp.c \
>>  	lib/lacp.h \
>>  	lib/latch.h \
>> diff --git a/lib/keepalive.c b/lib/keepalive.c new file mode 100644
>> index 0000000..ac73a42
>> --- /dev/null
>> +++ b/lib/keepalive.c
>> @@ -0,0 +1,183 @@
>> +/*
>> + * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
>
>This line is not appropriately attributing the file.

[BHANU]  Should be "Copyright (c) 2017 Intel, Inc."

>
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at:
>> + *
>> + *     http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing,
>> + software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>implied.
>> + * See the License for the specific language governing permissions
>> + and
>> + * limitations under the License.
>> + */
>> +
>> +#include <config.h>
>> +#include <errno.h>
>> +#include <fcntl.h>
>> +#include <stdbool.h>
>> +#include <unistd.h>
>> +
>> +#include "keepalive.h"
>> +#include "lib/vswitch-idl.h"
>> +#include "openvswitch/vlog.h"
>> +#include "timeval.h"
>> +
>> +VLOG_DEFINE_THIS_MODULE(keepalive);
>> +
>> +static bool keepalive_enable = false;    /* Keepalive disabled by default */
>> +static bool ka_init_status = ka_init_failure; /* Keepalive
>> +initialization */
>
>You're assigning this bool a value from an enum.  I know that's probably
>allowed, but it looks strange to me.  I would prefer that this type either reflect
>the enum type or a true/false value is used instead.

[BHANU]   OK.

>
>> +static uint32_t keepalive_timer_interval;     /* keepalive timer interval */
>> +static struct keepalive_info *ka_info = NULL;
>
>Why allocate ka_info?  It will simplify some of the later code to just keep it
>statically available.  It also means you can eliminate the
>xzalloc() and free() calls you use later on in code.

[BHANU]   Ok, saves me few lines of code. 

>
>Also, the nice thing about a static declaration is the structure will already be 0
>filled, and you'll know at program initialization time whether it will succeed in
>getting the allocation.
>
>> +
>> +inline bool
>
>The inline keyword is inappropriate in .c files.  Please let the compiler do it's
>job.

[BHANU]   Ok

>
>> +ka_is_enabled(void)
>> +{
>> +    return keepalive_enable;
>> +}
>> +
>
>I'm not sure about enable / disable.  In this case, I think the branches are not
>needed at all.  Better to have this always enabled and just deal with whatever
>fallout may come.

[BHANU]  Would like to retain this so that users can disable it if they don't like additional
thread running and constant updates to OvSDB. 

>
>> +inline int
>
>See previous comment about inline and c files.
>
>> +ka_get_pmd_tid(unsigned core_idx)
>> +{
>> +    if (ka_is_enabled()) {
>> +        return ka_info->thread_id[core_idx];
>> +    }
>> +
>> +    return -EINVAL;
>
>I would expect EINVAL for core_idx which addresses outside of the thread_id
>range.
>
>Actually, I'm wondering why have a "core-id" based lookup at all?  Keep a map
>of tid which are being monitored.  tid will never change; core_idx can be
>manipulated by outside forces (taskset comes to mind).

[BHANU]
May be there is some confusion here. Please see below for additional details.

As tid is unique across the system, tid is used for hash calculation and same is used
while registering/unregistering the threads to/from KA framework and updating the information.
(See ka_register_thread(tid,.. ),  ka_unregister_thread(), ka_set_pmd_state_ts())

For below reasons you see reference to core-id in the code:
       - In most cases it doesn't make sense to correlate thread and core as multiple thread can be scheduled on same core,
         but in case of PMDs there is 1:1 mapping as a PMD thread is pinned to specific core.
         So its fine to do a lookup based on core-id for PMD thread and store/retrieve the PMD tid in an array indexed by core-id. 

         We need PMD's  tid to register/unregister a PMD to KA framework. we retrieve the tid of the PMD using gettid() and store it in a array i.e
         'pid_t thread_id[KA_DP_MAXCORES]'  (KA_DP_MAXCORES(128) is the maximum number of cores/PMDs the compute can support.)

      -  please note the other arrays(state_flags, last_alive, active_cores) in Keepalive_info struct are used only by PMD threads and not for other threads. 
         This is done so that Keepalive and PMD thread can update the information in lockless way and doesn't impact the performance. This is inspired from rte_keepalive library.
         Look at dispatch_heartbeats() and ka_mark_pmd_thread_alive().

>
>I'm not sure there's a useful way of correlating core / task usage.  If the task
>can be moved by the scheduler then the only reliable way to track this
>information is via the operating system (since the task itself won't know
>where it runs if it's cpuset contains a mask of core IDs).
>
>I think having task based usage reported could be useful.  Core-based usage is
>less so (since other processes may be scheduled on the core, outside of OvS's
>control - and even the system operator may change the core assignments).
>
>Given that, I'd drop all core_id references.  Just use the task id, and track that
>everywhere.  As an added benefit, I think it makes some of the code simpler.

>> +}
>> +
>> +void
>> +ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
>> +                    uint64_t last_alive) {
>> +    struct ka_process_info *pinfo;
>> +    int tid = ka_get_pmd_tid(core_id);
>> +
>> +    ovs_mutex_lock(&ka_info->proclist_mutex);
>
>If this ever gets called from a PMD context, this is a performance killer.

[BHANU]  This is only called from Keepalive thread, so no worries on datapath performance!

CP:  ovs_keepalive()  -> dispatch_heartbeats()  -> relay_cb() ->  ka_update_core_state() - > ka_set_pmd_state_ts(). 
                     
Infact this is what I was trying to explain in previous case.
We want to maintain the temporary state transition info of PMDs in Keepalive_info struct  and update the 
state and time information from Keepalive thread context.

>
>> +    HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0),
>> +                             &ka_info->process_list) {
>> +        if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) {
>
>Do we really need to check against both?

[BHANU] Not needed, tid check is enough as it is unique. Will remove unnecessary core check. 

>
>> +            pinfo->core_state = state;
>> +            pinfo->core_last_seen_times = last_alive;
>> +        }
>> +    }
>> +    ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +}
>> +
>> +/* Retrieve and return the keepalive timer interval from OVSDB. */
>> +static uint32_t ka_get_timer_interval(const struct smap
>> +*ovs_other_config OVS_UNUSED)
>
>The OVS_UNUSED here is not an appropriate annotation.

[BHANU]  Mistake, will remove this.

>
>> +{
>> +#define OVS_KEEPALIVE_TIMEOUT 1000    /* Default timeout set to
>1000ms */
>
>Move this definition to the header file, and put the word DEFAULT in it
>somewhere.

[BHANU]  Ok.

>
>> +    uint32_t ka_interval;
>> +
>> +    /* Timer granularity in milliseconds
>> +     * Defaults to OVS_KEEPALIVE_TIMEOUT(ms) if not set */
>> +    ka_interval = smap_get_int(ovs_other_config, "keepalive-interval",
>> +                  OVS_KEEPALIVE_TIMEOUT);
>> +
>> +    VLOG_INFO("Keepalive timer interval set to %"PRIu32" (ms)\n",
>ka_interval);
>> +    return ka_interval;
>> +}
>> +
>> +/*
>> + * This function shall be invoked periodically to write the core
>> +status and
>> + * last seen timestamp of the cores in to keepalive info structure.
>> + */
>> +void
>> +ka_update_core_state(const int core_id, const enum keepalive_state
>core_state,
>> +                     uint64_t last_alive, void *ptr_data OVS_UNUSED)
>
>Don't have an OVS_UNUSED parameter here.  It doesn't make sense.  Add it
>when it is time.

[BHANU]  Thanks. When I merged the patches and forget to clean up this old changes.

>
>> +{
>> +    switch (core_state) {
>> +    case KA_STATE_ALIVE:
>> +    case KA_STATE_MISSING:
>> +        ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive);
>> +        break;
>> +    case KA_STATE_UNUSED:
>> +    case KA_STATE_DOZING:
>> +    case KA_STATE_SLEEP:
>> +    case KA_STATE_DEAD:
>> +    case KA_STATE_GONE:
>> +        ka_set_pmd_state_ts(core_id, core_state, last_alive);
>> +        break;
>> +    }
>> +}
>> +
>> +static void
>> +keepalive_register_relay_cb(ka_relay_cb cb, void *aux) {
>> +    ka_info->relay_cb = cb;
>> +    ka_info->relay_cb_data = aux;
>> +}
>> +
>> +static struct keepalive_info *
>> +keepalive_info_create(void)
>> +{
>> +    struct keepalive_info *ka_info;
>> +    ka_info = xzalloc(sizeof *ka_info);
>> +    if (ka_info != NULL) {
>
>This condition tests for a case that can never occur.  I also think there should
>be no need for a dynamic allocation here.

[BHANU] Let's have static structure as suggested earlier and this above checks can be removed. 

>
>> +        ka_info->init_time = time_wall_msec();
>> +    }
>> +
>> +    return ka_info;
>> +}
>> +
>> +static int
>> +ka_init__(void)
>> +{
>> +    if ((ka_info = keepalive_info_create()) == NULL) {
>
>Likewise.
>
>> +        VLOG_ERR("OvS Keepalive - initialization failed.");
>> +        return -1;
>> +    }
>> +
>> +    keepalive_register_relay_cb(ka_update_core_state, NULL);
>> +    ovs_mutex_init(&ka_info->proclist_mutex);
>> +    hmap_init(&ka_info->process_list);
>> +
>> +    return 0;
>> +}
>> +
>> +void
>> +ka_init(const struct smap *ovs_other_config) {
>> +    if (smap_get_bool(ovs_other_config, "enable-keepalive", false)) {
>
>I think because of the way this is written, it is possible to change the keepalive
>from 'false' to 'true' (but not the other way) at runtime.
>
>Regardless, I think this toggle shouldn't exist.

[BHANU] I will check this. 

>
>> +        static struct ovsthread_once once_enable =
>> + OVSTHREAD_ONCE_INITIALIZER;
>> +
>> +        if (ovsthread_once_start(&once_enable)) {
>> +            keepalive_enable = true;
>> +            VLOG_INFO("OvS Keepalive enabled.");
>> +
>> +            keepalive_timer_interval =
>> +                ka_get_timer_interval(ovs_other_config);
>> +
>> +            int err = ka_init__();
>> +            if (!err) {
>> +                VLOG_INFO("OvS Keepalive - initialized.");
>> +                ka_init_status = ka_init_success;
>> +            }
>> +
>> +            ovsthread_once_done(&once_enable);
>> +        }
>> +    }
>> +}
>> +
>> +void
>> +ka_destroy(void)
>> +{
>> +    if (ka_info) {
>> +        struct ka_process_info *pinfo;
>> +
>> +        ovs_mutex_lock(&ka_info->proclist_mutex);
>> +        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
>> +            free(pinfo);
>> +        }
>> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +
>> +        hmap_destroy(&ka_info->process_list);
>> +        ovs_mutex_destroy(&ka_info->proclist_mutex);
>> +
>> +        free(ka_info);
>
>if the dynamic allocation is removed, much of this can be corrected.

[BHANU] right.

>
>> +    }
>> +}
>> diff --git a/lib/keepalive.h b/lib/keepalive.h new file mode 100644
>> index 0000000..d133398
>> --- /dev/null
>> +++ b/lib/keepalive.h
>> @@ -0,0 +1,87 @@
>> +/*
>> + * Copyright (c) 2016 Nicira, Inc.
>
>This line is not appropriately attributing the file.

[BHANU]  Should be "Copyright (c) 2017 Intel, Inc."

>
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at:
>> + *
>> + *     http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing,
>> + software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>implied.
>> + * See the License for the specific language governing permissions
>> + and
>> + * limitations under the License.
>> + */
>> +
>> +#ifndef KEEPALIVE_H
>> +#define KEEPALIVE_H
>> +
>> +#include <stdint.h>
>> +#include "openvswitch/hmap.h"
>> +#include "ovs-thread.h"
>> +
>> +#define KA_DP_MAXCORES 128
>
>This doesn't make sense to me.  Track by task/thread ID.  Don't use a fixed
>sized array - use a map and index by the task id.  When you need to iterate
>through all, MAP_FOR_EACH.

[BHANU]  The array here isn't for iterating the threads. As documented below,
this is temporarily used by PMD threads & Keepalive thread to update the states
which later gets written in to process_list map by Keepalive thread. Please note 
I do iterate using MAP_FOR_EACH. For example check get_pmd_status() on how I iterate the threads. 

>
>> +
>> +struct smap;
>> +
>> +enum keepalive_state {
>> +    KA_STATE_UNUSED = 0,
>> +    KA_STATE_ALIVE = 1,
>> +    KA_STATE_MISSING = 4,
>> +    KA_STATE_DEAD = 2,
>> +    KA_STATE_GONE = 3,
>> +    KA_STATE_DOZING = 5,
>> +    KA_STATE_SLEEP = 6,
>> +};
>
>Again, please describe these states.
>
>It looks strange to me to declare an enum this way.  The compiler will do all of
>this for you.

[BHANU] It's taken from rte_keepalive.h,  I will fix it anyways.  

>
>> +
>> +typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void
>> +*);
>> +
>> +struct ka_process_info {
>> +    int tid;
>> +    int core_id;
>
>Since threads / processes can be bound to multiple cpus, use cpu_set_t rather
>than a simple int.

[BHANU]  Ok fine. 

>
>> +    enum keepalive_state core_state;
>> +    uint64_t core_last_seen_times;
>> +    struct hmap_node node;
>> +};
>> +
>> +struct keepalive_info {
>
>Does this need to be exposed in this header?  I think not.

[BHANU] I should check this, I remember the code(larger patch series) in v3  needed this in header file. 

>
>> +    /* Mutex for 'process_list'. */
>> +    struct ovs_mutex proclist_mutex;
>> +
>> +    /* List of process/threads monitored by KA framework. */
>> +    struct hmap process_list OVS_GUARDED;
>
>Maybe a cmap instead.

[BHANU] AFAIK, cmap is needed if I have multiple readers and a writer, otherwise hmaps are sufficient.
In this case I think hmap is sufficient. 

>
>> +
>> +    /* Store Datapath threads 'tid'.
>> +     * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
>> +    pid_t thread_id[KA_DP_MAXCORES];
>> +
>> +    /* Datapath cores state. */
>> +    enum keepalive_state state_flags[KA_DP_MAXCORES];
>> +
>> +    uint64_t init_time;
>> +    /* Last seen timestamps. */
>> +    uint64_t last_alive[KA_DP_MAXCORES];
>> +
>> +    /* Active cores (non-zero if the core should be monitored). */
>> +    uint8_t active_cores[KA_DP_MAXCORES];
>
>I guess this is a hold over from the older 'track cores' version.  The KA
>framework will monitor tasks, not cores, yes?  In that case, let's drop all the
>core_id, core_idx, etc.

[BHANU]  As mentioned earlier, the above is used by PMD thread & Keepalive thread to
update state & time information of PMD threads. As there is 1:1 mapping between
PMD thread and core we can use this when tracking PMD threads and inspired from rte_keepalive library.

>
>> +
>> +    /** Relay handler. */
>> +    ka_relay_cb relay_cb;
>> +    void *relay_cb_data;
>> +};
>> +
>> +enum keepalive_status {
>> +    ka_init_failure = 0,
>> +    ka_init_success
>> +};
>> +
>> +void ka_init(const struct smap *);
>> +void ka_destroy(void);
>> +void ka_set_pmd_state_ts(unsigned, enum keepalive_state, uint64_t);
>> +void ka_update_core_state(const int, const enum keepalive_state,
>> +                          uint64_t, void *);
>> +
>> +int ka_get_pmd_tid(unsigned core);
>
>I'm assuming all of the non-task id stuff will get removed (since we should
>only report on task/thread ID - that's all we can reliably know).

[BHANU]  This is still valid for PMD cores. We can retrieve the thread id of PMD using the core-id.

>
>> +bool ka_is_enabled(void);
>
>Again, let's remove the enabled/disabled flag if we're going to have this
>feature.
>
>> +#endif /* keepalive.h */
>> diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index
>> a8cbae7..aaa8d9b 100644
>> --- a/vswitchd/bridge.c
>> +++ b/vswitchd/bridge.c
>> @@ -34,6 +34,7 @@
>>  #include "hmapx.h"
>>  #include "if-notifier.h"
>>  #include "jsonrpc.h"
>> +#include "keepalive.h"
>>  #include "lacp.h"
>>  #include "mac-learning.h"
>>  #include "mcast-snooping.h"
>> @@ -506,6 +507,7 @@ bridge_exit(bool delete_datapath)
>>          bridge_destroy(br, delete_datapath);
>>      }
>>      ovsdb_idl_destroy(idl);
>> +    ka_destroy();
>>  }
>>
>>  /* Looks at the list of managers in 'ovs_cfg' and extracts their
>> remote IP @@ -2953,6 +2955,7 @@ bridge_run(void)
>>      if (cfg) {
>>          netdev_set_flow_api_enabled(&cfg->other_config);
>>          dpdk_init(&cfg->other_config);
>> +        ka_init(&cfg->other_config);
>>      }
>>
>>      /* Initialize the ofproto library.  This only needs to run once,
>> but diff --git a/vswitchd/vswitch.ovsschema
>> b/vswitchd/vswitch.ovsschema index 90e50b6..c56a64c 100644
>> --- a/vswitchd/vswitch.ovsschema
>> +++ b/vswitchd/vswitch.ovsschema
>> @@ -1,6 +1,6 @@
>>  {"name": "Open_vSwitch",
>> - "version": "7.15.1",
>> - "cksum": "3682332033 23608",
>> + "version": "7.16.0",
>> + "cksum": "3631938350 23762",
>>   "tables": {
>>     "Open_vSwitch": {
>>       "columns": {
>> @@ -30,6 +30,10 @@
>>           "type": {"key": "string", "value": "string",
>>                    "min": 0, "max": "unlimited"},
>>           "ephemeral": true},
>> +       "keepalive": {
>> +         "type": {"key": "string", "value": "string", "min": 0,
>> +                  "max": "unlimited"},
>> +         "ephemeral": true},
>>         "ovs_version": {
>>           "type": {"key": {"type": "string"},
>>                    "min": 0, "max": 1}}, diff --git
>> a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 074535b..4302fd5
>> 100644
>> --- a/vswitchd/vswitch.xml
>> +++ b/vswitchd/vswitch.xml
>> @@ -568,6 +568,55 @@
>>            </p>
>>          </column>
>>        </group>
>> +
>> +      <group title="Keepalive">
>> +        <p>
>> +          The <code>keepalive</code> column contains key-value pairs that
>> +          report health of datapath threads in Open vSwitch.  These are
>updated
>> +          periodically (based on the keepalive-interval).
>> +        </p>
>> +
>> +        <column name="other_config" key="enable-keepalive"
>> +                type='{"type": "boolean"}'>
>> +          Keepalive is disabled by default to avoid overhead in the common
>> +          case when heartbeat monitoring is not useful.  Set this value to
>> +          <code>true</code> to enable keepalive <ref column="keepalive"/>
>> +          column or to <code>false</code> to explicitly disable it.
>> +        </column>
>> +
>> +        <column name="other_config" key="keepalive-interval"
>> +                type='{"type": "integer", "minInteger": 1}'>
>> +          <p>
>> +            Specifies the keepalive interval value.
>
>Add 'in ms.' or 'in milliseconds.'
>
>I think a minimum value of 1 is far too aggressive.  Probably this should be in
>10 ms increments, and a minimum value of 1 would make a little bit more
>sense.  In that case, a default value of 100 would also make sense.

[BHANU] OK. At present 1000ms is default value. 

>
>> +          </p>
>> +          <p>
>> +            If not specified, this will be set to 1000 milliseconds (default
>> +            value). Changing this value requires restarting the daemon.
>> +          </p>
>> +        </column>
>> +
>> +        <column name="keepalive" key="PMD_ID">
>> +          <p>
>> +            One such key-value pair, with <code>ID</code> replaced by the
>> +            PMD thread, will exist for each active PMD thread.  The value is a
>> +            comma-separated list of PMD thread status, core number and the
>> +            last seen timestamp of PMD thread. In respective order, these
>> +            values are:
>> +          </p>
>> +
>> +          <ol>
>> +            <li>Status of PMD thread.  Valid values include ALIVE, MISSING,
>> +            DEAD, GONE, DOZING, SLEEPING.</li>
>
>It would be good in the documentation to tell what these states mean, and
>what would be expected.  For instance - dead vs. gone?  What about dozing
>vs. sleeping.  These states aren't apparent and don't have any kind of match
>with any kind of process states (think START, READY, RUNNING, WAITING,
>TERMINATED).

[BHANU] OK. 

>
>> +            <li>Core id of PMD thread.</li>
>
>I think you mean task id (or name?), here.
>
>> +            <li>Last seen timestamp(epoch) of the PMD core.</li>
>> +          </ol>
>> +
>> +          <p>
>> +            This is only valid for OvS-DPDK Datapath and PMD threads status
>> +            is implemented currently.
>> +          </p>
>> +        </column>
>> +      </group>
>>      </group>
>>
>>      <group title="Version Reporting">

- Bhanuprakash.
diff mbox

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index 2415f4c..0d99f0a 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -110,6 +110,8 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/json.c \
 	lib/jsonrpc.c \
 	lib/jsonrpc.h \
+	lib/keepalive.c \
+	lib/keepalive.h \
 	lib/lacp.c \
 	lib/lacp.h \
 	lib/latch.h \
diff --git a/lib/keepalive.c b/lib/keepalive.c
new file mode 100644
index 0000000..ac73a42
--- /dev/null
+++ b/lib/keepalive.c
@@ -0,0 +1,183 @@ 
+/*
+ * Copyright (c) 2014, 2015, 2016, 2017 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <unistd.h>
+
+#include "keepalive.h"
+#include "lib/vswitch-idl.h"
+#include "openvswitch/vlog.h"
+#include "timeval.h"
+
+VLOG_DEFINE_THIS_MODULE(keepalive);
+
+static bool keepalive_enable = false;    /* Keepalive disabled by default */
+static bool ka_init_status = ka_init_failure; /* Keepalive initialization */
+static uint32_t keepalive_timer_interval;     /* keepalive timer interval */
+static struct keepalive_info *ka_info = NULL;
+
+inline bool
+ka_is_enabled(void)
+{
+    return keepalive_enable;
+}
+
+inline int
+ka_get_pmd_tid(unsigned core_idx)
+{
+    if (ka_is_enabled()) {
+        return ka_info->thread_id[core_idx];
+    }
+
+    return -EINVAL;
+}
+
+void
+ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
+                    uint64_t last_alive)
+{
+    struct ka_process_info *pinfo;
+    int tid = ka_get_pmd_tid(core_id);
+
+    ovs_mutex_lock(&ka_info->proclist_mutex);
+    HMAP_FOR_EACH_WITH_HASH (pinfo, node, hash_int(tid, 0),
+                             &ka_info->process_list) {
+        if ((pinfo->core_id == core_id) && (pinfo->tid == tid)) {
+            pinfo->core_state = state;
+            pinfo->core_last_seen_times = last_alive;
+        }
+    }
+    ovs_mutex_unlock(&ka_info->proclist_mutex);
+}
+
+/* Retrieve and return the keepalive timer interval from OVSDB. */
+static uint32_t
+ka_get_timer_interval(const struct smap *ovs_other_config OVS_UNUSED)
+{
+#define OVS_KEEPALIVE_TIMEOUT 1000    /* Default timeout set to 1000ms */
+    uint32_t ka_interval;
+
+    /* Timer granularity in milliseconds
+     * Defaults to OVS_KEEPALIVE_TIMEOUT(ms) if not set */
+    ka_interval = smap_get_int(ovs_other_config, "keepalive-interval",
+                  OVS_KEEPALIVE_TIMEOUT);
+
+    VLOG_INFO("Keepalive timer interval set to %"PRIu32" (ms)\n", ka_interval);
+    return ka_interval;
+}
+
+/*
+ * This function shall be invoked periodically to write the core status and
+ * last seen timestamp of the cores in to keepalive info structure.
+ */
+void
+ka_update_core_state(const int core_id, const enum keepalive_state core_state,
+                     uint64_t last_alive, void *ptr_data OVS_UNUSED)
+{
+    switch (core_state) {
+    case KA_STATE_ALIVE:
+    case KA_STATE_MISSING:
+        ka_set_pmd_state_ts(core_id, KA_STATE_ALIVE, last_alive);
+        break;
+    case KA_STATE_UNUSED:
+    case KA_STATE_DOZING:
+    case KA_STATE_SLEEP:
+    case KA_STATE_DEAD:
+    case KA_STATE_GONE:
+        ka_set_pmd_state_ts(core_id, core_state, last_alive);
+        break;
+    }
+}
+
+static void
+keepalive_register_relay_cb(ka_relay_cb cb, void *aux)
+{
+    ka_info->relay_cb = cb;
+    ka_info->relay_cb_data = aux;
+}
+
+static struct keepalive_info *
+keepalive_info_create(void)
+{
+    struct keepalive_info *ka_info;
+    ka_info = xzalloc(sizeof *ka_info);
+    if (ka_info != NULL) {
+        ka_info->init_time = time_wall_msec();
+    }
+
+    return ka_info;
+}
+
+static int
+ka_init__(void)
+{
+    if ((ka_info = keepalive_info_create()) == NULL) {
+        VLOG_ERR("OvS Keepalive - initialization failed.");
+        return -1;
+    }
+
+    keepalive_register_relay_cb(ka_update_core_state, NULL);
+    ovs_mutex_init(&ka_info->proclist_mutex);
+    hmap_init(&ka_info->process_list);
+
+    return 0;
+}
+
+void
+ka_init(const struct smap *ovs_other_config)
+{
+    if (smap_get_bool(ovs_other_config, "enable-keepalive", false)) {
+        static struct ovsthread_once once_enable = OVSTHREAD_ONCE_INITIALIZER;
+
+        if (ovsthread_once_start(&once_enable)) {
+            keepalive_enable = true;
+            VLOG_INFO("OvS Keepalive enabled.");
+
+            keepalive_timer_interval =
+                ka_get_timer_interval(ovs_other_config);
+
+            int err = ka_init__();
+            if (!err) {
+                VLOG_INFO("OvS Keepalive - initialized.");
+                ka_init_status = ka_init_success;
+            }
+
+            ovsthread_once_done(&once_enable);
+        }
+    }
+}
+
+void
+ka_destroy(void)
+{
+    if (ka_info) {
+        struct ka_process_info *pinfo;
+
+        ovs_mutex_lock(&ka_info->proclist_mutex);
+        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
+            free(pinfo);
+        }
+        ovs_mutex_unlock(&ka_info->proclist_mutex);
+
+        hmap_destroy(&ka_info->process_list);
+        ovs_mutex_destroy(&ka_info->proclist_mutex);
+
+        free(ka_info);
+    }
+}
diff --git a/lib/keepalive.h b/lib/keepalive.h
new file mode 100644
index 0000000..d133398
--- /dev/null
+++ b/lib/keepalive.h
@@ -0,0 +1,87 @@ 
+/*
+ * Copyright (c) 2016 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KEEPALIVE_H
+#define KEEPALIVE_H
+
+#include <stdint.h>
+#include "openvswitch/hmap.h"
+#include "ovs-thread.h"
+
+#define KA_DP_MAXCORES 128
+
+struct smap;
+
+enum keepalive_state {
+    KA_STATE_UNUSED = 0,
+    KA_STATE_ALIVE = 1,
+    KA_STATE_MISSING = 4,
+    KA_STATE_DEAD = 2,
+    KA_STATE_GONE = 3,
+    KA_STATE_DOZING = 5,
+    KA_STATE_SLEEP = 6,
+};
+
+typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
+
+struct ka_process_info {
+    int tid;
+    int core_id;
+    enum keepalive_state core_state;
+    uint64_t core_last_seen_times;
+    struct hmap_node node;
+};
+
+struct keepalive_info {
+    /* Mutex for 'process_list'. */
+    struct ovs_mutex proclist_mutex;
+
+    /* List of process/threads monitored by KA framework. */
+    struct hmap process_list OVS_GUARDED;
+
+    /* Store Datapath threads 'tid'.
+     * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
+    pid_t thread_id[KA_DP_MAXCORES];
+
+    /* Datapath cores state. */
+    enum keepalive_state state_flags[KA_DP_MAXCORES];
+
+    uint64_t init_time;
+    /* Last seen timestamps. */
+    uint64_t last_alive[KA_DP_MAXCORES];
+
+    /* Active cores (non-zero if the core should be monitored). */
+    uint8_t active_cores[KA_DP_MAXCORES];
+
+    /** Relay handler. */
+    ka_relay_cb relay_cb;
+    void *relay_cb_data;
+};
+
+enum keepalive_status {
+    ka_init_failure = 0,
+    ka_init_success
+};
+
+void ka_init(const struct smap *);
+void ka_destroy(void);
+void ka_set_pmd_state_ts(unsigned, enum keepalive_state, uint64_t);
+void ka_update_core_state(const int, const enum keepalive_state,
+                          uint64_t, void *);
+
+int ka_get_pmd_tid(unsigned core);
+bool ka_is_enabled(void);
+#endif /* keepalive.h */
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index a8cbae7..aaa8d9b 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -34,6 +34,7 @@ 
 #include "hmapx.h"
 #include "if-notifier.h"
 #include "jsonrpc.h"
+#include "keepalive.h"
 #include "lacp.h"
 #include "mac-learning.h"
 #include "mcast-snooping.h"
@@ -506,6 +507,7 @@  bridge_exit(bool delete_datapath)
         bridge_destroy(br, delete_datapath);
     }
     ovsdb_idl_destroy(idl);
+    ka_destroy();
 }
 
 /* Looks at the list of managers in 'ovs_cfg' and extracts their remote IP
@@ -2953,6 +2955,7 @@  bridge_run(void)
     if (cfg) {
         netdev_set_flow_api_enabled(&cfg->other_config);
         dpdk_init(&cfg->other_config);
+        ka_init(&cfg->other_config);
     }
 
     /* Initialize the ofproto library.  This only needs to run once, but
diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema
index 90e50b6..c56a64c 100644
--- a/vswitchd/vswitch.ovsschema
+++ b/vswitchd/vswitch.ovsschema
@@ -1,6 +1,6 @@ 
 {"name": "Open_vSwitch",
- "version": "7.15.1",
- "cksum": "3682332033 23608",
+ "version": "7.16.0",
+ "cksum": "3631938350 23762",
  "tables": {
    "Open_vSwitch": {
      "columns": {
@@ -30,6 +30,10 @@ 
          "type": {"key": "string", "value": "string",
                   "min": 0, "max": "unlimited"},
          "ephemeral": true},
+       "keepalive": {
+         "type": {"key": "string", "value": "string", "min": 0,
+                  "max": "unlimited"},
+         "ephemeral": true},
        "ovs_version": {
          "type": {"key": {"type": "string"},
                   "min": 0, "max": 1}},
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 074535b..4302fd5 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -568,6 +568,55 @@ 
           </p>
         </column>
       </group>
+
+      <group title="Keepalive">
+        <p>
+          The <code>keepalive</code> column contains key-value pairs that
+          report health of datapath threads in Open vSwitch.  These are updated
+          periodically (based on the keepalive-interval).
+        </p>
+
+        <column name="other_config" key="enable-keepalive"
+                type='{"type": "boolean"}'>
+          Keepalive is disabled by default to avoid overhead in the common
+          case when heartbeat monitoring is not useful.  Set this value to
+          <code>true</code> to enable keepalive <ref column="keepalive"/>
+          column or to <code>false</code> to explicitly disable it.
+        </column>
+
+        <column name="other_config" key="keepalive-interval"
+                type='{"type": "integer", "minInteger": 1}'>
+          <p>
+            Specifies the keepalive interval value.
+          </p>
+          <p>
+            If not specified, this will be set to 1000 milliseconds (default
+            value). Changing this value requires restarting the daemon.
+          </p>
+        </column>
+
+        <column name="keepalive" key="PMD_ID">
+          <p>
+            One such key-value pair, with <code>ID</code> replaced by the
+            PMD thread, will exist for each active PMD thread.  The value is a
+            comma-separated list of PMD thread status, core number and the
+            last seen timestamp of PMD thread. In respective order, these
+            values are:
+          </p>
+
+          <ol>
+            <li>Status of PMD thread.  Valid values include ALIVE, MISSING,
+            DEAD, GONE, DOZING, SLEEPING.</li>
+            <li>Core id of PMD thread.</li>
+            <li>Last seen timestamp(epoch) of the PMD core.</li>
+          </ol>
+
+          <p>
+            This is only valid for OvS-DPDK Datapath and PMD threads status
+            is implemented currently.
+          </p>
+        </column>
+      </group>
     </group>
 
     <group title="Version Reporting">