| Message ID | 1565713402-5458-4-git-send-email-hzhou8@ebay.com |
|---|---|
| State | Superseded |
| Headers | show |
| Series | [ovs-dev,1/4] raft.c: Move raft_reset_ping_timer() out of the loop. | expand |
On Tue, Aug 13, 2019 at 9:23 AM Han Zhou <zhouhan@gmail.com> wrote: > > From: Han Zhou <hzhou8@ebay.com> > > As mentioned in RAFT paper, section 6.2: > > Leaders: A server might be in the leader state, but if it isn’t the current > leader, it could be needlessly delaying client requests. For example, suppose a > leader is partitioned from the rest of the cluster, but it can still > communicate with a particular client. Without additional mechanism, it could > delay a request from that client forever, being unable to replicate a log entry > to any other servers. Meanwhile, there might be another leader of a newer term > that is able to communicate with a majority of the cluster and would be able to > commit the client’s request. Thus, a leader in Raft steps down if an election > timeout elapses without a successful round of heartbeats to a majority of its > cluster; this allows clients to retry their requests with another server. > > Signed-off-by: Han Zhou <hzhou8@ebay.com> > --- > ovsdb/raft-private.h | 3 ++ > ovsdb/raft.c | 42 ++++++++++++++++- > tests/ovsdb-cluster.at | 123 +++++++++++++++++++++++++++++-------------------- > 3 files changed, 116 insertions(+), 52 deletions(-) > > diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h > index 35a3dd7..fb7e6e3 100644 > --- a/ovsdb/raft-private.h > +++ b/ovsdb/raft-private.h > @@ -80,6 +80,9 @@ struct raft_server { > uint64_t next_index; /* Index of next log entry to send this server. */ > uint64_t match_index; /* Index of max log entry server known to have. */ > enum raft_server_phase phase; > + bool replied; /* Reply to append_request was received from this > + node during current election_timeout interval. > + */ > /* For use in adding and removing servers: */ > struct uuid requester_sid; /* Nonzero if requested via RPC. */ > struct unixctl_conn *requester_conn; /* Only if requested via unixctl. 
*/ > diff --git a/ovsdb/raft.c b/ovsdb/raft.c > index 63c3081..9516a6f 100644 > --- a/ovsdb/raft.c > +++ b/ovsdb/raft.c > @@ -1792,7 +1792,43 @@ raft_run(struct raft *raft) > } > > if (!raft->joining && time_msec() >= raft->election_timeout) { > - raft_start_election(raft, false); > + if (raft->role == RAFT_LEADER) { > + /* Check if majority of followers replied, then reset > + * election_timeout and reset s->replied. Otherwise, become > + * follower. > + * > + * Raft paper section 6.2: Leaders: A server might be in the leader > + * state, but if it isn’t the current leader, it could be > + * needlessly delaying client requests. For example, suppose a > + * leader is partitioned from the rest of the cluster, but it can > + * still communicate with a particular client. Without additional > + * mechanism, it could delay a request from that client forever, > + * being unable to replicate a log entry to any other servers. > + * Meanwhile, there might be another leader of a newer term that is > + * able to communicate with a majority of the cluster and would be > + * able to commit the client’s request. Thus, a leader in Raft > + * steps down if an election timeout elapses without a successful > + * round of heartbeats to a majority of its cluster; this allows > + * clients to retry their requests with another server. 
*/ > + int count = 0; > + HMAP_FOR_EACH (server, hmap_node, &raft->servers) { > + if (server->replied) { > + count ++; > + } > + } > + if (count >= hmap_count(&raft->servers) / 2) { > + HMAP_FOR_EACH (server, hmap_node, &raft->servers) { > + server->replied = false; > + } > + raft_reset_election_timer(raft); > + } else { > + raft_become_follower(raft); > + raft_start_election(raft, false); > + } > + } else { > + raft_start_election(raft, false); > + } > + > } > > if (raft->leaving && time_msec() >= raft->leave_timeout) { > @@ -2454,6 +2490,7 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s) > s->next_index = raft->log_end; > s->match_index = 0; > s->phase = RAFT_PHASE_STABLE; > + s->replied = false; > } > > static void > @@ -2477,7 +2514,7 @@ raft_become_leader(struct raft *raft) > ovs_assert(raft->role != RAFT_LEADER); > raft->role = RAFT_LEADER; > raft_set_leader(raft, &raft->sid); > - raft->election_timeout = LLONG_MAX; > + raft_reset_election_timer(raft); > raft_reset_ping_timer(raft); > > struct raft_server *s; > @@ -3207,6 +3244,7 @@ raft_handle_append_reply(struct raft *raft, > } > } > > + s->replied = true; > if (rpy->result == RAFT_APPEND_OK) { > /* Figure 3.1: "If successful, update nextIndex and matchIndex for > * follower (section 3.5)." */ > diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at > index 6a13843..7146fe6 100644 > --- a/tests/ovsdb-cluster.at > +++ b/tests/ovsdb-cluster.at > @@ -65,59 +65,82 @@ EXECUTION_EXAMPLES > > AT_BANNER([OVSDB - disconnect from cluster]) > > -# When a node is disconnected from the cluster, the IDL should disconnect > -# and retry even if it uses a single remote, because the remote IP can be > -# a VIP on a load-balance. 
> -AT_SETUP([OVSDB cluster - disconnect from cluster, single remote]) > -AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) > +OVS_START_SHELL_HELPERS > +# ovsdb_test_cluster_disconnect LEADER_OR_FOLLOWER > +ovsdb_test_cluster_disconnect () { > + leader_or_follower=$1 > + schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` > + ordinal_schema > schema > + AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) > + cid=`ovsdb-tool db-cid s1.db` > + schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` > + for i in `seq 2 3`; do > + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) > + done > + > + on_exit 'kill `cat *.pid`' > + for i in `seq 3`; do > + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) > + done > + for i in `seq 3`; do > + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) > + done > + > + AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest", > + {"op": "insert", > + "table": "simple", > + "row": {"i": 1}}]]'], [0], [ignore], [ignore]) > + > + # When a node is disconnected from the cluster, the IDL should disconnect > + # and retry even if it uses a single remote, because the remote IP can be > + # a VIP on a load-balance. So we use single remote to test here. > + if test $leader_or_follower == "leader"; then > + target=1 > + shutdown="2 3" > + else > + target=3 > + > + # shutdown follower before the leader so that there is no chance for s3 > + # become leader during the process. > + shutdown="2 1" > + fi > + > + # Connect to $target. Use "wait" to trigger a non-op transaction so > + # that test-ovsdb will not quit. 
> + > + on_exit 'pkill test-ovsdb' > + test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl unix:s$target.ovsdb '[["idltest", > + {"op": "wait", > + "table": "simple", > + "where": [["i", "==", 1]], > + "columns": ["i"], > + "until": "==", > + "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 & > > -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` > -ordinal_schema > schema > -AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) > -cid=`ovsdb-tool db-cid s1.db` > -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` > -for i in `seq 2 3`; do > - AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) > -done > - > -on_exit 'kill `cat *.pid`' > -for i in `seq 3`; do > - AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) > -done > -for i in `seq 3`; do > - AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) > -done > - > -AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest", > - {"op": "insert", > - "table": "simple", > - "row": {"i": 1}}]]'], [0], [ignore], [ignore]) > - > -# Connect to a single remote s3. Use "wait" to trigger a non-op transaction so > -# that test-ovsdb will not quit. > - > -on_exit 'pkill test-ovsdb' > -test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl unix:s3.ovsdb '[["idltest", > - {"op": "wait", > - "table": "simple", > - "where": [["i", "==", 1]], > - "columns": ["i"], > - "until": "==", > - "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 & > - > -OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log]) > - > -# Shutdown the other 2 servers so that s3 is disconnected from the cluster. > -for i in 2 1; do > - OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) > -done > - > -# The test-ovsdb should detect the disconnect and retry. 
> -OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log]) > - > -OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s3], [s3.pid]) > + OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log]) > > + # Shutdown the other servers so that $target is disconnected from the cluster. > + for i in $shutdown; do > + OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) > + done > + > + # The test-ovsdb should detect the disconnect and retry. > + OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log]) > + > + OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$target], [s$target.pid]) > +} > +OVS_END_SHELL_HELPERS > + > +AT_SETUP([OVSDB cluster - follower disconnect from cluster, single remote]) > +AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) > +ovsdb_test_cluster_disconnect follower > +AT_CLEANUP > + > +AT_SETUP([OVSDB cluster - leader disconnect from cluster, single remote]) > +AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) > +ovsdb_test_cluster_disconnect leader > AT_CLEANUP > + > > > OVS_START_SHELL_HELPERS > -- > 2.1.0 > Sorry that I forgot to add: Reported-by: Aliasgar Ginwala <aginwala@ebay.com> Tested-by: Aliasgar Ginwala <aginwala@ebay.com> I will wait for comments and add it to v2 (if v2 is needed).
diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h index 35a3dd7..fb7e6e3 100644 --- a/ovsdb/raft-private.h +++ b/ovsdb/raft-private.h @@ -80,6 +80,9 @@ struct raft_server { uint64_t next_index; /* Index of next log entry to send this server. */ uint64_t match_index; /* Index of max log entry server known to have. */ enum raft_server_phase phase; + bool replied; /* Reply to append_request was received from this + node during current election_timeout interval. + */ /* For use in adding and removing servers: */ struct uuid requester_sid; /* Nonzero if requested via RPC. */ struct unixctl_conn *requester_conn; /* Only if requested via unixctl. */ diff --git a/ovsdb/raft.c b/ovsdb/raft.c index 63c3081..9516a6f 100644 --- a/ovsdb/raft.c +++ b/ovsdb/raft.c @@ -1792,7 +1792,43 @@ raft_run(struct raft *raft) } if (!raft->joining && time_msec() >= raft->election_timeout) { - raft_start_election(raft, false); + if (raft->role == RAFT_LEADER) { + /* Check if majority of followers replied, then reset + * election_timeout and reset s->replied. Otherwise, become + * follower. + * + * Raft paper section 6.2: Leaders: A server might be in the leader + * state, but if it isn’t the current leader, it could be + * needlessly delaying client requests. For example, suppose a + * leader is partitioned from the rest of the cluster, but it can + * still communicate with a particular client. Without additional + * mechanism, it could delay a request from that client forever, + * being unable to replicate a log entry to any other servers. + * Meanwhile, there might be another leader of a newer term that is + * able to communicate with a majority of the cluster and would be + * able to commit the client’s request. Thus, a leader in Raft + * steps down if an election timeout elapses without a successful + * round of heartbeats to a majority of its cluster; this allows + * clients to retry their requests with another server. 
*/ + int count = 0; + HMAP_FOR_EACH (server, hmap_node, &raft->servers) { + if (server->replied) { + count ++; + } + } + if (count >= hmap_count(&raft->servers) / 2) { + HMAP_FOR_EACH (server, hmap_node, &raft->servers) { + server->replied = false; + } + raft_reset_election_timer(raft); + } else { + raft_become_follower(raft); + raft_start_election(raft, false); + } + } else { + raft_start_election(raft, false); + } + } if (raft->leaving && time_msec() >= raft->leave_timeout) { @@ -2454,6 +2490,7 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s) s->next_index = raft->log_end; s->match_index = 0; s->phase = RAFT_PHASE_STABLE; + s->replied = false; } static void @@ -2477,7 +2514,7 @@ raft_become_leader(struct raft *raft) ovs_assert(raft->role != RAFT_LEADER); raft->role = RAFT_LEADER; raft_set_leader(raft, &raft->sid); - raft->election_timeout = LLONG_MAX; + raft_reset_election_timer(raft); raft_reset_ping_timer(raft); struct raft_server *s; @@ -3207,6 +3244,7 @@ raft_handle_append_reply(struct raft *raft, } } + s->replied = true; if (rpy->result == RAFT_APPEND_OK) { /* Figure 3.1: "If successful, update nextIndex and matchIndex for * follower (section 3.5)." */ diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at index 6a13843..7146fe6 100644 --- a/tests/ovsdb-cluster.at +++ b/tests/ovsdb-cluster.at @@ -65,59 +65,82 @@ EXECUTION_EXAMPLES AT_BANNER([OVSDB - disconnect from cluster]) -# When a node is disconnected from the cluster, the IDL should disconnect -# and retry even if it uses a single remote, because the remote IP can be -# a VIP on a load-balance. 
-AT_SETUP([OVSDB cluster - disconnect from cluster, single remote]) -AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) +OVS_START_SHELL_HELPERS +# ovsdb_test_cluster_disconnect LEADER_OR_FOLLOWER +ovsdb_test_cluster_disconnect () { + leader_or_follower=$1 + schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` + ordinal_schema > schema + AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) + cid=`ovsdb-tool db-cid s1.db` + schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` + for i in `seq 2 3`; do + AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) + done + + on_exit 'kill `cat *.pid`' + for i in `seq 3`; do + AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) + done + for i in `seq 3`; do + AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) + done + + AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest", + {"op": "insert", + "table": "simple", + "row": {"i": 1}}]]'], [0], [ignore], [ignore]) + + # When a node is disconnected from the cluster, the IDL should disconnect + # and retry even if it uses a single remote, because the remote IP can be + # a VIP on a load-balance. So we use single remote to test here. + if test $leader_or_follower == "leader"; then + target=1 + shutdown="2 3" + else + target=3 + + # shutdown follower before the leader so that there is no chance for s3 + # become leader during the process. + shutdown="2 1" + fi + + # Connect to $target. Use "wait" to trigger a non-op transaction so + # that test-ovsdb will not quit. 
+ + on_exit 'pkill test-ovsdb' + test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl unix:s$target.ovsdb '[["idltest", + {"op": "wait", + "table": "simple", + "where": [["i", "==", 1]], + "columns": ["i"], + "until": "==", + "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 & -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -ordinal_schema > schema -AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) -cid=`ovsdb-tool db-cid s1.db` -schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -for i in `seq 2 3`; do - AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) -done - -on_exit 'kill `cat *.pid`' -for i in `seq 3`; do - AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) -done -for i in `seq 3`; do - AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) -done - -AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest", - {"op": "insert", - "table": "simple", - "row": {"i": 1}}]]'], [0], [ignore], [ignore]) - -# Connect to a single remote s3. Use "wait" to trigger a non-op transaction so -# that test-ovsdb will not quit. - -on_exit 'pkill test-ovsdb' -test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -v -t10 idl unix:s3.ovsdb '[["idltest", - {"op": "wait", - "table": "simple", - "where": [["i", "==", 1]], - "columns": ["i"], - "until": "==", - "rows": [{"i": 1}]}]]' > test-ovsdb.log 2>&1 & - -OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log]) - -# Shutdown the other 2 servers so that s3 is disconnected from the cluster. -for i in 2 1; do - OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) -done - -# The test-ovsdb should detect the disconnect and retry. 
-OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log]) - -OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s3], [s3.pid]) + OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log]) + # Shutdown the other servers so that $target is disconnected from the cluster. + for i in $shutdown; do + OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) + done + + # The test-ovsdb should detect the disconnect and retry. + OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log]) + + OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$target], [s$target.pid]) +} +OVS_END_SHELL_HELPERS + +AT_SETUP([OVSDB cluster - follower disconnect from cluster, single remote]) +AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) +ovsdb_test_cluster_disconnect follower +AT_CLEANUP + +AT_SETUP([OVSDB cluster - leader disconnect from cluster, single remote]) +AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) +ovsdb_test_cluster_disconnect leader AT_CLEANUP + OVS_START_SHELL_HELPERS