diff mbox series

[ovs-dev,v2] northd: Update the probe interval in main loop.

Message ID 20210921180859.16066-1-zhewang@nvidia.com
State Accepted
Headers show
Series [ovs-dev,v2] northd: Update the probe interval in main loop. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes fail github build: failed

Commit Message

Zhen Wang Sept. 21, 2021, 6:08 p.m. UTC
From: zhen wang <zhewang@nvidia.com>

When ovn-northd works in HA mode, ovn-northd will not update the
probe interval in standby mode. If the SB/NB raft leader and the active
ovn-northd instance get killed by a system power outage, the standby
ovn-northd instance will never detect the failure.
This patch addresses the problem by updating the probe value in the main loop.

Signed-off-by: zhen wang <zhewang@nvidia.com>
---
 northd/northd.c     | 25 -------------------------
 northd/ovn-northd.c | 30 ++++++++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 25 deletions(-)

Comments

Han Zhou Sept. 21, 2021, 10:48 p.m. UTC | #1
On Tue, Sep 21, 2021 at 11:08 AM Zhen Wang <zhewang@nvidia.com> wrote:
>
> From: zhen wang <zhewang@nvidia.com>
>
> When ovn-northd works in HA mode, ovn-northd will not update the
> probe interval in standby mode. If the SB/NB raft leader and the active
> ovn-northd instance get killed by a system power outage, the standby
> ovn-northd instance will never detect the failure.
> This patch addresses the problem by updating the probe value in the main loop.
>
> Signed-off-by: zhen wang <zhewang@nvidia.com>

Thanks Zhen. I applied this fix to master, branch-21.09, 21.06 and 21.03.

For master and branch-21.09, I made a minor adjustment to the commit
message:

    When ovn-northd works in HA mode, ovn-northd will not update the
    probe interval in standby mode. This patch addresses the problem by
    updating the probe value in the main loop.

I removed the sentence that describes the HA impact because on branch-21.09
and master after the commit 520d5ceda3 that split northd.c, the behavior
changed. Although standby still won't get the probe interval updated, the
impact is different because the probe interval for standby won't be 0 (but
instead it will be the default 5s). The HA impact is valid for older
branches so I kept the message as is for them. Regardless of the commit
message, the actual fixes are essentially the same for all branches.

Thanks,
Han
diff mbox series

Patch

diff --git a/northd/northd.c b/northd/northd.c
index 621e83175..91635b93b 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -73,10 +73,6 @@  static struct eth_addr svc_monitor_mac_ea;
  * Otherwise, it will avoid using it.  The default is true. */
 static bool use_ct_inv_match = true;
 
-/* Default probe interval for NB and SB DB connections. */
-#define DEFAULT_PROBE_INTERVAL_MSEC 5000
-static int northd_probe_interval_nb = 0;
-static int northd_probe_interval_sb = 0;
 #define MAX_OVN_TAGS 4096
 
 /* Pipeline stages. */
@@ -14190,20 +14186,6 @@  build_meter_groups(struct northd_context *ctx,
     }
 }
 
-static int
-get_probe_interval(const char *db, const struct nbrec_nb_global *nb)
-{
-    int default_interval = (db && !stream_or_pstream_needs_probes(db)
-                            ? 0 : DEFAULT_PROBE_INTERVAL_MSEC);
-    int interval = smap_get_int(&nb->options,
-                                "northd_probe_interval", default_interval);
-
-    if (interval > 0 && interval < 1000) {
-        interval = 1000;
-    }
-    return interval;
-}
-
 static void
 ovnnb_db_run(struct northd_context *ctx,
              struct ovsdb_idl_index *sbrec_chassis_by_name,
@@ -14290,13 +14272,6 @@  ovnnb_db_run(struct northd_context *ctx,
 
     smap_destroy(&options);
 
-    /* Update the probe interval. */
-    northd_probe_interval_nb = get_probe_interval(ctx->ovnnb_db, nb);
-    northd_probe_interval_sb = get_probe_interval(ctx->ovnsb_db, nb);
-
-    ovsdb_idl_set_probe_interval(ctx->ovnnb_idl, northd_probe_interval_nb);
-    ovsdb_idl_set_probe_interval(ctx->ovnsb_idl, northd_probe_interval_sb);
-
     use_parallel_build =
         (smap_get_bool(&nb->options, "use_parallel_build", false) &&
          can_parallelize_hashes(false));
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 42c0ad644..39aa96055 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -65,6 +65,10 @@  static const char *ssl_private_key_file;
 static const char *ssl_certificate_file;
 static const char *ssl_ca_cert_file;
 
+/* Default probe interval for NB and SB DB connections. */
+#define DEFAULT_PROBE_INTERVAL_MSEC 5000
+static int northd_probe_interval_nb = 0;
+static int northd_probe_interval_sb = 0;
 static bool use_parallel_build = true;
 
 static const char *rbac_chassis_auth[] =
@@ -576,6 +580,20 @@  update_ssl_config(void)
     }
 }
 
+static int
+get_probe_interval(const char *db, const struct nbrec_nb_global *nb)
+{
+    int default_interval = (db && !stream_or_pstream_needs_probes(db)
+                            ? 0 : DEFAULT_PROBE_INTERVAL_MSEC);
+    int interval = smap_get_int(&nb->options,
+                                "northd_probe_interval", default_interval);
+
+    if (interval > 0 && interval < 1000) {
+        interval = 1000;
+    }
+    return interval;
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -997,6 +1015,18 @@  main(int argc, char *argv[])
             poll_immediate_wake();
         }
 
+        const struct nbrec_nb_global *nb =
+            nbrec_nb_global_first(ovnnb_idl_loop.idl);
+        /* Update the probe interval. */
+        if (nb) {
+            northd_probe_interval_nb = get_probe_interval(ovnnb_db, nb);
+            northd_probe_interval_sb = get_probe_interval(ovnsb_db, nb);
+        }
+        ovsdb_idl_set_probe_interval(ovnnb_idl_loop.idl,
+                                     northd_probe_interval_nb);
+        ovsdb_idl_set_probe_interval(ovnsb_idl_loop.idl,
+                                     northd_probe_interval_sb);
+
         if (reset_ovnsb_idl_min_index) {
             VLOG_INFO("Resetting southbound database cluster state");
             ovsdb_idl_reset_min_index(ovnsb_idl_loop.idl);