@@ -386,6 +386,7 @@ static void raft_get_servers_from_log(struct raft *, enum vlog_level);
static void raft_get_election_timer_from_log(struct raft *);
static bool raft_handle_write_error(struct raft *, struct ovsdb_error *);
+static bool raft_has_uncommitted_configuration(const struct raft *);
static void raft_run_reconfigure(struct raft *);
@@ -2848,6 +2849,18 @@ raft_become_leader(struct raft *raft)
raft_reset_election_timer(raft);
raft_reset_ping_timer(raft);
+ if (raft->joining) {
+ /* It is possible that the server committing this one to the list of
+ * servers lost leadership before the entry is committed but after
+ * it was already replicated to majority of servers. In this case
+ * other servers will recognize this one as a valid cluster member
+ * and may transfer leadership to it and vote for it. This way
+     * we're becoming a cluster leader without receiving a reply for a
+     * join request and will commit the addition of this server ourselves. */
+ VLOG_INFO_RL(&rl, "elected as leader while joining");
+ raft->joining = false;
+ }
+
struct raft_server *s;
HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
raft_server_init_leader(raft, s);
@@ -3006,12 +3019,12 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
}
while (raft->commit_index < new_commit_index) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
uint64_t index = ++raft->commit_index;
const struct raft_entry *e = raft_get_entry(raft, index);
if (raft_entry_has_data(e)) {
struct raft_command *cmd = raft_find_command_by_eid(raft, &e->eid);
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
if (cmd) {
if (!cmd->index && raft->role == RAFT_LEADER) {
@@ -3055,6 +3068,35 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
* reallocate raft->entries, which would invalidate 'e', so
* this case must be last, after the one for 'e->data'. */
raft_run_reconfigure(raft);
+ } else if (e->servers && !raft_has_uncommitted_configuration(raft)) {
+ struct ovsdb_error *error;
+ struct raft_server *s;
+ struct hmap servers;
+
+ error = raft_servers_from_json(e->servers, &servers);
+ ovs_assert(!error);
+ HMAP_FOR_EACH (s, hmap_node, &servers) {
+ struct raft_server *server = raft_find_server(raft, &s->sid);
+
+ if (server && server->phase == RAFT_PHASE_COMMITTING) {
+ /* This server lost leadership while committing
+ * server 's', but it was committed later by a
+ * new leader. */
+ server->phase = RAFT_PHASE_STABLE;
+ }
+
+ if (raft->joining && uuid_equals(&s->sid, &raft->sid)) {
+ /* Leadership change happened before previous leader
+ * could commit the change of a servers list, but it
+ * was replicated and a new leader committed it. */
+ VLOG_INFO_RL(&rl,
+ "added to configuration without reply "
+ "(eid: "UUID_FMT", commit index: %"PRIu64")",
+ UUID_ARGS(&e->eid), index);
+ raft->joining = false;
+ }
+ }
+ raft_servers_destroy(&servers);
}
}
@@ -473,6 +473,59 @@ done
AT_CLEANUP
+AT_SETUP([OVSDB cluster - leadership change after replication while joining])
+AT_KEYWORDS([ovsdb server negative unix cluster join])
+
+n=5
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db dnl
+ $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+cid=$(ovsdb-tool db-cid s1.db)
+schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema)
+for i in $(seq 2 $n); do
+ AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill $(cat *.pid)'
+on_exit "
+ for i in \$(ls $(pwd)/s[[0-$n]]); do
+ ovs-appctl --timeout 1 -t \$i cluster/status $schema_name;
+ done
+"
+
+dnl Starting servers one by one asking all existing servers to transfer
+dnl leadership after append reply forcing the joining server to try another
+dnl one that will also transfer leadership. Since transfer is happening
+dnl after the servers update is replicated to other servers, one of the
+dnl other servers will actually commit it. It may be a new leader from
+dnl one of the old members or the new joining server itself.
+for i in $(seq $n); do
+ dnl Make sure that all already started servers joined the cluster.
+ for j in $(seq $((i - 1)) ); do
+ AT_CHECK([ovsdb_client_wait unix:s$j.ovsdb $schema_name connected])
+ done
+ for j in $(seq $((i - 1)) ); do
+ OVS_WAIT_UNTIL([ovs-appctl -t "$(pwd)"/s$j \
+ cluster/failure-test \
+ transfer-leadership-after-sending-append-request \
+ | grep -q "engaged"])
+ done
+
+ AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off \
+ --detach --no-chdir --log-file=s$i.log \
+ --pidfile=s$i.pid --unixctl=s$i \
+ --remote=punix:s$i.ovsdb s$i.db])
+done
+
+dnl Make sure that all servers joined the cluster.
+for i in $(seq $n); do
+ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+for i in $(seq $n); do
+ OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
+done
+
+AT_CLEANUP
OVS_START_SHELL_HELPERS