Message ID | 1565713402-5458-3-git-send-email-hzhou8@ebay.com |
---|---|
State | Superseded |
Headers | show |
Series | [ovs-dev,1/4] raft.c: Move raft_reset_ping_timer() out of the loop. | expand |
On Tue, Aug 13, 2019 at 9:23 AM Han Zhou <zhouhan@gmail.com> wrote: > > From: Han Zhou <hzhou8@ebay.com> > > candiate_retrying is used to determine if the current node is disconnected > from the cluster when the node is in candiate role. However, a node > can flap between candidate and follower role before a leader is elected > when majority of the cluster is down, so is_connected() will flap, too, which > confuses clients. > > This patch avoids the flapping with the help of a new member had_leader, > so that if no leader was elected since last election, we know we are > still retrying, and keep as disconnected from the cluster. > > Signed-off-by: Han Zhou <hzhou8@ebay.com> > --- > ovsdb/raft.c | 25 ++++++++++++++++++++----- > 1 file changed, 20 insertions(+), 5 deletions(-) > > diff --git a/ovsdb/raft.c b/ovsdb/raft.c > index 1c38b3b..63c3081 100644 > --- a/ovsdb/raft.c > +++ b/ovsdb/raft.c > @@ -286,8 +286,11 @@ struct raft { > > /* Candidates only. Reinitialized at start of election. */ > int n_votes; /* Number of votes for me. */ > - bool candidate_retrying; /* The first round of election timed-out and it > - is now retrying. */ > + bool candidate_retrying; /* The earlier election timed-out and we are > + now retrying. */ > + bool had_leader; /* There has been leader elected since last > + election initiated. This is to help setting > + candidate_retrying. */ > }; > > /* All Raft structures. */ > @@ -345,6 +348,7 @@ static bool raft_handle_write_error(struct raft *, struct ovsdb_error *); > > static void raft_run_reconfigure(struct raft *); > > +static void raft_set_leader(struct raft *, const struct uuid *sid); > static struct raft_server * > raft_find_server(const struct raft *raft, const struct uuid *sid) > { > @@ -1616,8 +1620,11 @@ raft_start_election(struct raft *raft, bool leadership_transfer) > } > > ovs_assert(raft->role != RAFT_LEADER); > - raft->candidate_retrying = (raft->role == RAFT_CANDIDATE); > raft->role = RAFT_CANDIDATE; > + /* If there was no leader elected since last election, we know we are > + * retrying now. */ > + raft->candidate_retrying = !raft->had_leader; > + raft->had_leader = false; > > raft->n_votes = 0; > > @@ -2450,6 +2457,14 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s) > } > > static void > +raft_set_leader(struct raft *raft, const struct uuid *sid) > +{ > + raft->leader_sid = *sid; > + raft->had_leader = true; > + raft->candidate_retrying = false; > +} > + > +static void > raft_become_leader(struct raft *raft) > { > log_all_commands(raft); > @@ -2461,7 +2476,7 @@ raft_become_leader(struct raft *raft) > > ovs_assert(raft->role != RAFT_LEADER); > raft->role = RAFT_LEADER; > - raft->leader_sid = raft->sid; > + raft_set_leader(raft, &raft->sid); > raft->election_timeout = LLONG_MAX; > raft_reset_ping_timer(raft); > > @@ -2855,7 +2870,7 @@ raft_update_leader(struct raft *raft, const struct uuid *sid) > raft_get_nickname(raft, sid, buf, sizeof buf), > raft->term); > } > - raft->leader_sid = *sid; > + raft_set_leader(raft, sid); > > /* Record the leader to the log. This is not used by the algorithm > * (although it could be, for quick restart), but it is used for > -- > 2.1.0 > Sorry that I forgot to update the condition in raft_is_connected(). I fixed it and added a test case in v2: https://patchwork.ozlabs.org/patch/1147277/
diff --git a/ovsdb/raft.c b/ovsdb/raft.c index 1c38b3b..63c3081 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -286,8 +286,11 @@ struct raft { /* Candidates only. Reinitialized at start of election. */ int n_votes; /* Number of votes for me. */ - bool candidate_retrying; /* The first round of election timed-out and it - is now retrying. */ + bool candidate_retrying; /* The earlier election timed-out and we are + now retrying. */ + bool had_leader; /* There has been leader elected since last + election initiated. This is to help setting + candidate_retrying. */ }; /* All Raft structures. */ @@ -345,6 +348,7 @@ static bool raft_handle_write_error(struct raft *, struct ovsdb_error *); static void raft_run_reconfigure(struct raft *); +static void raft_set_leader(struct raft *, const struct uuid *sid); static struct raft_server * raft_find_server(const struct raft *raft, const struct uuid *sid) { @@ -1616,8 +1620,11 @@ raft_start_election(struct raft *raft, bool leadership_transfer) } ovs_assert(raft->role != RAFT_LEADER); - raft->candidate_retrying = (raft->role == RAFT_CANDIDATE); raft->role = RAFT_CANDIDATE; + /* If there was no leader elected since last election, we know we are + * retrying now. */ + raft->candidate_retrying = !raft->had_leader; + raft->had_leader = false; raft->n_votes = 0; @@ -2450,6 +2457,14 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s) } static void +raft_set_leader(struct raft *raft, const struct uuid *sid) +{ + raft->leader_sid = *sid; + raft->had_leader = true; + raft->candidate_retrying = false; +} + +static void raft_become_leader(struct raft *raft) { log_all_commands(raft); @@ -2461,7 +2476,7 @@ raft_become_leader(struct raft *raft) ovs_assert(raft->role != RAFT_LEADER); raft->role = RAFT_LEADER; - raft->leader_sid = raft->sid; + raft_set_leader(raft, &raft->sid); raft->election_timeout = LLONG_MAX; raft_reset_ping_timer(raft); @@ -2855,7 +2870,7 @@ raft_update_leader(struct raft *raft, const struct uuid *sid) raft_get_nickname(raft, sid, buf, sizeof buf), raft->term); } - raft->leader_sid = *sid; + raft_set_leader(raft, sid); /* Record the leader to the log. This is not used by the algorithm * (although it could be, for quick restart), but it is used for