diff mbox series

[ovs-dev,06/10] raft: Fix notifications when a server leaves the cluster.

Message ID 20181115060534.7146-6-blp@ovn.org
State Accepted
Headers show
Series [ovs-dev,01/10] ovsdb-idl: Avoid sending transactions when the DB is not synced up. | expand

Commit Message

Ben Pfaff Nov. 15, 2018, 6:05 a.m. UTC
When server A sends the leader a request to remove server B from the
cluster, where A != B, the leader sends both A and B a notification when
the removal is complete.  Until now, however, the notification (which is a
raft_remove_server_reply message) did not say which server had been
removed, and the receiver did not check.  Instead, the receiver assumed
that it had been removed.  The result was that B was removed and A stopped
serving out the database even though it was still part of the cluster,
This commit fixes the problem.

Reported-by: ramteja tadishetti <ramtejatadishetti@gmail.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
---
 ovsdb/raft-rpc.c |  5 +++++
 ovsdb/raft-rpc.h |  7 +++++++
 ovsdb/raft.c     | 50 ++++++++++++++++++++++++++++++++++++++------------
 3 files changed, 50 insertions(+), 12 deletions(-)
diff mbox series

Patch

diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c
index 13aee0c4bac5..56c07d4879ba 100644
--- a/ovsdb/raft-rpc.c
+++ b/ovsdb/raft-rpc.c
@@ -460,6 +460,10 @@  static void
 raft_remove_server_reply_to_jsonrpc(const struct raft_remove_server_reply *rpy,
                                     struct json *args)
 {
+    if (!uuid_is_zero(&rpy->target_sid)) {
+        json_object_put_format(args, "target_server",
+                               UUID_FMT, UUID_ARGS(&rpy->target_sid));
+    }
     json_object_put(args, "success", json_boolean_create(rpy->success));
 }
 
@@ -468,6 +472,7 @@  raft_remove_server_reply_from_jsonrpc(struct ovsdb_parser *p,
                                       struct raft_remove_server_reply *rpy)
 {
     rpy->success = raft_parse_required_boolean(p, "success");
+    raft_parse_optional_uuid(p, "target_server", &rpy->target_sid);
 }
 
 static void
diff --git a/ovsdb/raft-rpc.h b/ovsdb/raft-rpc.h
index 15ddf012838c..bdc3429cc67c 100644
--- a/ovsdb/raft-rpc.h
+++ b/ovsdb/raft-rpc.h
@@ -205,6 +205,13 @@  struct raft_add_server_reply {
 struct raft_remove_server_reply {
     struct raft_rpc_common common;
     bool success;
+
+    /* SID of the removed server, but all-zeros if it is the same as the
+     * destination of the RPC.  (Older ovsdb-server did not have 'target_sid'
+     * and assumed that the destination was always the target, so by omitting
+     * 'target_sid' when this is the case we can preserve a small amount of
+     * inter-version compatibility.) */
+    struct uuid target_sid;
 };
 
 struct raft_install_snapshot_request {
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 07884820ed9b..753881586334 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -296,6 +296,7 @@  static void raft_send_remove_server_reply__(
     struct raft *, const struct uuid *target_sid,
     const struct uuid *requester_sid, struct unixctl_conn *requester_conn,
     bool success, const char *comment);
+static void raft_finished_leaving_cluster(struct raft *);
 
 static void raft_server_init_leader(struct raft *, struct raft_server *);
 
@@ -303,6 +304,7 @@  static bool raft_rpc_is_heartbeat(const union raft_rpc *);
 static bool raft_is_rpc_synced(const struct raft *, const union raft_rpc *);
 
 static void raft_handle_rpc(struct raft *, const union raft_rpc *);
+
 static bool raft_send_at(struct raft *, const union raft_rpc *,
                          int line_number);
 #define raft_send(raft, rpc) raft_send_at(raft, rpc, __LINE__)
@@ -2197,16 +2199,28 @@  raft_send_add_server_reply__(struct raft *raft, const struct uuid *sid,
 }
 
 static void
-raft_send_remove_server_reply_rpc(struct raft *raft, const struct uuid *sid,
+raft_send_remove_server_reply_rpc(struct raft *raft,
+                                  const struct uuid *dst_sid,
+                                  const struct uuid *target_sid,
                                   bool success, const char *comment)
 {
+    if (uuid_equals(&raft->sid, dst_sid)) {
+        if (success && uuid_equals(&raft->sid, target_sid)) {
+            raft_finished_leaving_cluster(raft);
+        }
+        return;
+    }
+
     const union raft_rpc rpy = {
         .remove_server_reply = {
             .common = {
                 .type = RAFT_RPC_REMOVE_SERVER_REPLY,
-                .sid = *sid,
+                .sid = *dst_sid,
                 .comment = CONST_CAST(char *, comment),
             },
+            .target_sid = (uuid_equals(dst_sid, target_sid)
+                           ? UUID_ZERO
+                           : *target_sid),
             .success = success,
         }
     };
@@ -2235,6 +2249,9 @@  raft_send_remove_server_reply__(struct raft *raft,
     } else {
         char buf[SID_LEN + 1];
         ds_put_cstr(&s, raft_get_nickname(raft, target_sid, buf, sizeof buf));
+        if (uuid_equals(target_sid, &raft->sid)) {
+            ds_put_cstr(&s, " (ourselves)");
+        }
     }
     ds_put_format(&s, " from cluster "CID_FMT" %s",
                   CID_ARGS(&raft->cid),
@@ -2251,11 +2268,12 @@  raft_send_remove_server_reply__(struct raft *raft,
      * allows it to be sure that it's really removed and update its log and
      * disconnect permanently.  */
     if (!uuid_is_zero(requester_sid)) {
-        raft_send_remove_server_reply_rpc(raft, requester_sid,
+        raft_send_remove_server_reply_rpc(raft, requester_sid, target_sid,
                                           success, comment);
     }
     if (!uuid_equals(requester_sid, target_sid)) {
-        raft_send_remove_server_reply_rpc(raft, target_sid, success, comment);
+        raft_send_remove_server_reply_rpc(raft, target_sid, target_sid,
+                                          success, comment);
     }
     if (requester_conn) {
         if (success) {
@@ -3559,17 +3577,25 @@  raft_handle_remove_server_request(struct raft *raft,
 }
 
 static void
-raft_handle_remove_server_reply(struct raft *raft,
-                                const struct raft_remove_server_reply *rpc)
+raft_finished_leaving_cluster(struct raft *raft)
 {
-    if (rpc->success) {
-        VLOG_INFO(SID_FMT": finished leaving cluster "CID_FMT,
-                  SID_ARGS(&raft->sid), CID_ARGS(&raft->cid));
+    VLOG_INFO(SID_FMT": finished leaving cluster "CID_FMT,
+              SID_ARGS(&raft->sid), CID_ARGS(&raft->cid));
 
-        raft_record_note(raft, "left", "this server left the cluster");
+    raft_record_note(raft, "left", "this server left the cluster");
+
+    raft->leaving = false;
+    raft->left = true;
+}
 
-        raft->leaving = false;
-        raft->left = true;
+static void
+raft_handle_remove_server_reply(struct raft *raft,
+                                const struct raft_remove_server_reply *rpc)
+{
+    if (rpc->success
+        && (uuid_is_zero(&rpc->target_sid)
+            || uuid_equals(&rpc->target_sid, &raft->sid))) {
+        raft_finished_leaving_cluster(raft);
     }
 }