[ovs-dev,06/13] reconnect: Add ability to do a number of retries without backoff.

Message ID 20171007004458.5788-7-blp@ovn.org
State New
Headers show
Series
  • clustering implementation, part 1
Related show

Commit Message

Ben Pfaff Oct. 7, 2017, 12:44 a.m.
This is aimed at an upcoming database clustering implementation, where it's
desirable to try all of the cluster members quickly before backing off to
retry them again in sequence.

Signed-off-by: Ben Pfaff <blp@ovn.org>
---
 lib/reconnect.c         | 52 ++++++++++++++++++++++++++++++---------------
 lib/reconnect.h         |  3 +++
 python/ovs/reconnect.py | 53 ++++++++++++++++++++++++++++++----------------
 tests/reconnect.at      | 56 ++++++++++++++++++++++++++++++++++++++++++++++++-
 tests/test-reconnect.c  |  8 +++++++
 tests/test-reconnect.py |  5 +++++
 6 files changed, 141 insertions(+), 36 deletions(-)

Comments

Russell Bryant Oct. 9, 2017, 7:47 p.m. | #1
On Fri, Oct 6, 2017 at 8:44 PM, Ben Pfaff <blp@ovn.org> wrote:
> This is aimed at an upcoming database clustering implementation, where it's
> desirable to try all of the cluster members quickly before backing off to
> retry them again in sequence.
>
> Signed-off-by: Ben Pfaff <blp@ovn.org>

Acked-by: Russell Bryant <russell@ovn.org>

Patch

diff --git a/lib/reconnect.c b/lib/reconnect.c
index 471fb7fc8d61..f91b4c09ae5d 100644
--- a/lib/reconnect.c
+++ b/lib/reconnect.c
@@ -62,6 +62,7 @@  struct reconnect {
     long long int last_connected;
     long long int last_disconnected;
     unsigned int max_tries;
+    unsigned int backoff_free_tries;
 
     /* These values are simply for statistics reporting, not otherwise used
      * directly by anything internal. */
@@ -206,6 +207,15 @@  reconnect_get_max_tries(struct reconnect *fsm)
     return fsm->max_tries;
 }
 
+/* Allows 'backoff_free_tries' connection attempts before 'fsm' starts to back
+ * off; each failed attempt uses one.  Values 0 and 1 both mean one attempt. */
+void
+reconnect_set_backoff_free_tries(struct reconnect *fsm,
+                                 unsigned int backoff_free_tries)
+{
+    fsm->backoff_free_tries = backoff_free_tries;
+}
+
 /* Configures the backoff parameters for 'fsm'.  'min_backoff' is the minimum
  * number of milliseconds, and 'max_backoff' is the maximum, between connection
  * attempts.  The current backoff is also the duration that 'fsm' is willing to
@@ -346,7 +356,7 @@  reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
                 VLOG(fsm->info, "%s: error listening for connections",
                      fsm->name);
             }
-        } else {
+        } else if (fsm->backoff < fsm->max_backoff) {
             const char *type = fsm->passive ? "listen" : "connection";
             if (error > 0) {
                 VLOG_INFO("%s: %s attempt failed (%s)",
@@ -359,30 +369,38 @@  reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
         if (fsm->state & (S_ACTIVE | S_IDLE)) {
             fsm->last_disconnected = now;
         }
+
+        if (!reconnect_may_retry(fsm)) {
+            reconnect_transition__(fsm, now, S_VOID);
+            return;
+        }
+
         /* Back off. */
-        if (fsm->state & (S_ACTIVE | S_IDLE)
-             && (fsm->last_activity - fsm->last_connected >= fsm->backoff
-                 || fsm->passive)) {
+        if (fsm->backoff_free_tries > 1) {
+            fsm->backoff_free_tries--;
+            fsm->backoff = 0;
+        } else if (fsm->state & (S_ACTIVE | S_IDLE)
+                   && (fsm->last_activity - fsm->last_connected >= fsm->backoff
+                       || fsm->passive)) {
             fsm->backoff = fsm->passive ? 0 : fsm->min_backoff;
         } else {
             if (fsm->backoff < fsm->min_backoff) {
                 fsm->backoff = fsm->min_backoff;
-            } else if (fsm->backoff >= fsm->max_backoff / 2) {
-                fsm->backoff = fsm->max_backoff;
-            } else {
+            } else if (fsm->backoff < fsm->max_backoff / 2) {
                 fsm->backoff *= 2;
-            }
-            if (fsm->passive) {
-                VLOG(fsm->info, "%s: waiting %.3g seconds before trying to "
-                          "listen again", fsm->name, fsm->backoff / 1000.0);
+                VLOG(fsm->info, "%s: waiting %.3g seconds before %s",
+                     fsm->name, fsm->backoff / 1000.0,
+                     fsm->passive ? "trying to listen again" : "reconnect");
             } else {
-                VLOG(fsm->info, "%s: waiting %.3g seconds before reconnect",
-                          fsm->name, fsm->backoff / 1000.0);
+                if (fsm->backoff < fsm->max_backoff) {
+                    VLOG_INFO("%s: continuing to %s in the background but "
+                              "suppressing further logging", fsm->name,
+                              fsm->passive ? "try to listen" : "reconnect");
+                }
+                fsm->backoff = fsm->max_backoff;
             }
         }
-
-        reconnect_transition__(fsm, now,
-                               reconnect_may_retry(fsm) ? S_BACKOFF : S_VOID);
+        reconnect_transition__(fsm, now, S_BACKOFF);
     }
 }
 
@@ -397,7 +415,7 @@  reconnect_connecting(struct reconnect *fsm, long long int now)
     if (fsm->state != S_CONNECTING) {
         if (fsm->passive) {
             VLOG(fsm->info, "%s: listening...", fsm->name);
-        } else {
+        } else if (fsm->backoff < fsm->max_backoff) {
             VLOG(fsm->info, "%s: connecting...", fsm->name);
         }
         reconnect_transition__(fsm, now, S_CONNECTING);
diff --git a/lib/reconnect.h b/lib/reconnect.h
index 4446713ce873..9f2d469e2ddd 100644
--- a/lib/reconnect.h
+++ b/lib/reconnect.h
@@ -51,6 +51,8 @@  int reconnect_get_probe_interval(const struct reconnect *);
 
 void reconnect_set_max_tries(struct reconnect *, unsigned int max_tries);
 unsigned int reconnect_get_max_tries(struct reconnect *);
+void reconnect_set_backoff_free_tries(struct reconnect *,
+                                      unsigned int backoff_free_tries);
 
 void reconnect_set_backoff(struct reconnect *,
                            int min_backoff, int max_backoff);
@@ -65,6 +67,7 @@  void reconnect_enable(struct reconnect *, long long int now);
 void reconnect_disable(struct reconnect *, long long int now);
 
 void reconnect_force_reconnect(struct reconnect *, long long int now);
+void reconnect_skip_backoff(struct reconnect *);
 
 bool reconnect_is_connected(const struct reconnect *);
 unsigned int reconnect_get_last_connect_elapsed(const struct reconnect *,
diff --git a/python/ovs/reconnect.py b/python/ovs/reconnect.py
index ec52ebb7affc..34cc76987031 100644
--- a/python/ovs/reconnect.py
+++ b/python/ovs/reconnect.py
@@ -154,6 +154,7 @@  class Reconnect(object):
         self.last_connected = None
         self.last_disconnected = None
         self.max_tries = None
+        self.backoff_free_tries = 0
 
         self.creation_time = now
         self.n_attempted_connections = 0
@@ -242,6 +243,12 @@  class Reconnect(object):
             self.backoff > self.max_backoff):
                 self.backoff = self.max_backoff
 
+    def set_backoff_free_tries(self, backoff_free_tries):
+        """Allows 'backoff_free_tries' connection attempts before this FSM
+        starts to back off; each failed attempt uses one.  Values 0 and 1
+        both mean one attempt."""
+        self.backoff_free_tries = backoff_free_tries
+
     def set_probe_interval(self, probe_interval):
         """Sets the "probe interval" to 'probe_interval', in milliseconds.  If
         this is zero, it disables the connection keepalive feature.  If it is
@@ -337,7 +344,7 @@  class Reconnect(object):
                 else:
                     self.info_level("%s: error listening for connections"
                                     % self.name)
-            else:
+            elif self.backoff < self.max_backoff:
                 if self.passive:
                     type_ = "listen"
                 else:
@@ -352,8 +359,15 @@  class Reconnect(object):
             if (self.state in (Reconnect.Active, Reconnect.Idle)):
                 self.last_disconnected = now
 
+            if not self.__may_retry():
+                self._transition(now, Reconnect.Void)
+                return
+
             # Back off
-            if (self.state in (Reconnect.Active, Reconnect.Idle) and
+            if self.backoff_free_tries > 1:
+                self.backoff_free_tries -= 1
+                self.backoff = 0
+            elif (self.state in (Reconnect.Active, Reconnect.Idle) and
                 (self.last_activity - self.last_connected >= self.backoff or
                  self.passive)):
                 if self.passive:
@@ -363,23 +377,26 @@  class Reconnect(object):
             else:
                 if self.backoff < self.min_backoff:
                     self.backoff = self.min_backoff
-                elif self.backoff >= self.max_backoff / 2:
-                    self.backoff = self.max_backoff
-                else:
+                elif self.backoff < self.max_backoff / 2:
                     self.backoff *= 2
-
-                if self.passive:
-                    self.info_level("%s: waiting %.3g seconds before trying "
-                                    "to listen again"
-                                    % (self.name, self.backoff / 1000.0))
+                    if self.passive:
+                        action = "trying to listen again"
+                    else:
+                        action = "reconnect"
+                    self.info_level("%s: waiting %.3g seconds before %s"
+                                    % (self.name, self.backoff / 1000.0,
+                                       action))
                 else:
-                    self.info_level("%s: waiting %.3g seconds before reconnect"
-                                    % (self.name, self.backoff / 1000.0))
-
-            if self.__may_retry():
-                self._transition(now, Reconnect.Backoff)
-            else:
-                self._transition(now, Reconnect.Void)
+                    if self.backoff < self.max_backoff:
+                        if self.passive:
+                            action = "try to listen"
+                        else:
+                            action = "reconnect"
+                        self.info_level("%s: continuing to %s in the "
+                                        "background but suppressing further "
+                                        "logging" % (self.name, action))
+                    self.backoff = self.max_backoff
+            self._transition(now, Reconnect.Backoff)
 
     def connecting(self, now):
         """Tell this FSM that a connection or listening attempt is in progress.
@@ -390,7 +407,7 @@  class Reconnect(object):
         if self.state != Reconnect.ConnectInProgress:
             if self.passive:
                 self.info_level("%s: listening..." % self.name)
-            else:
+            elif self.backoff < self.max_backoff:
                 self.info_level("%s: connecting..." % self.name)
             self._transition(now, Reconnect.ConnectInProgress)
 
diff --git a/tests/reconnect.at b/tests/reconnect.at
index c88ca785cad2..59c95d95bdd3 100644
--- a/tests/reconnect.at
+++ b/tests/reconnect.at
@@ -1037,6 +1037,60 @@  timeout
 ])
 
 ######################################################################
+RECONNECT_CHECK([backoff-free tries work],
+  [set-backoff-free-tries 2
+enable
+
+# Connection fails quickly.
+run
+connect-failed ECONNREFUSED
+
+# No backoff.
+run
+timeout
+
+# Connection fails quickly again.
+run
+connect-failed ECONNREFUSED
+
+# Back off for 1000 ms.
+run
+timeout
+],
+   [### t=1000 ###
+set-backoff-free-tries 2
+enable
+  in BACKOFF for 0 ms (0 ms backoff)
+
+# Connection fails quickly.
+run
+  should connect
+connect-failed ECONNREFUSED
+  0 successful connections out of 1 attempts, seqno 0
+
+# No backoff.
+run
+  should connect
+timeout
+  advance 0 ms
+
+# Connection fails quickly again.
+run
+  should connect
+connect-failed ECONNREFUSED
+  in BACKOFF for 0 ms (1000 ms backoff)
+  0 successful connections out of 2 attempts, seqno 0
+
+# Back off for 1000 ms.
+run
+timeout
+  advance 1000 ms
+
+### t=2000 ###
+  in BACKOFF for 1000 ms (1000 ms backoff)
+])
+
+######################################################################
 RECONNECT_CHECK([max-tries of 1 honored],
   [set-max-tries 1
 enable
@@ -1090,7 +1144,7 @@  timeout
 run
   should disconnect
 disconnected
-  in VOID for 0 ms (1000 ms backoff)
+  in VOID for 0 ms (0 ms backoff)
   1 successful connections out of 1 attempts, seqno 2
   disconnected
   disconnected at 11000 ms (0 ms ago)
diff --git a/tests/test-reconnect.c b/tests/test-reconnect.c
index 72252b8f707b..5a14e7fe58da 100644
--- a/tests/test-reconnect.c
+++ b/tests/test-reconnect.c
@@ -208,6 +208,12 @@  do_set_max_tries(struct ovs_cmdl_context *ctx)
 }
 
 static void
+do_set_backoff_free_tries(struct ovs_cmdl_context *ctx)
+{
+    reconnect_set_backoff_free_tries(reconnect, atoi(ctx->argv[1]));
+}
+
+static void
 diff_stats(const struct reconnect_stats *old,
            const struct reconnect_stats *new,
            int delta)
@@ -284,6 +290,8 @@  static const struct ovs_cmdl_command all_commands[] = {
     { "advance", NULL, 1, 1, do_advance, OVS_RO },
     { "timeout", NULL, 0, 0, do_timeout, OVS_RO },
     { "set-max-tries", NULL, 1, 1, do_set_max_tries, OVS_RO },
+    { "set-backoff-free-tries", NULL, 1, 1, do_set_backoff_free_tries,
+      OVS_RO },
     { "passive", NULL, 0, 0, do_set_passive, OVS_RO },
     { "listening", NULL, 0, 0, do_listening, OVS_RO },
     { "listen-error", NULL, 1, 1, do_listen_error, OVS_RO },
diff --git a/tests/test-reconnect.py b/tests/test-reconnect.py
index 8132fd9258ef..6cd052878eb1 100644
--- a/tests/test-reconnect.py
+++ b/tests/test-reconnect.py
@@ -104,6 +104,10 @@  def do_set_max_tries(arg):
     r.set_max_tries(int(arg))
 
 
+def do_set_backoff_free_tries(arg):
+    r.set_backoff_free_tries(int(arg))
+
+
 def diff_stats(old, new, delta):
     if (old.state != new.state or
         old.state_elapsed != new.state_elapsed or
@@ -173,6 +177,7 @@  def main():
         "advance": do_advance,
         "timeout": do_timeout,
         "set-max-tries": do_set_max_tries,
+        "set-backoff-free-tries": do_set_backoff_free_tries,
         "passive": do_set_passive,
         "listening": do_listening,
         "listen-error": do_listen_error