diff mbox series

[ovs-dev,v4] ovsdb raft: Sync commit index to followers without delay.

Message ID 1553061384-116689-1-git-send-email-hzhou8@ebay.com
State Superseded
Headers show
Series [ovs-dev,v4] ovsdb raft: Sync commit index to followers without delay. | expand

Commit Message

Han Zhou March 20, 2019, 5:56 a.m. UTC
From: Han Zhou <hzhou8@ebay.com>

When update is requested from follower, the leader sends AppendRequest
to all followers and wait until AppendReply received from majority, and
then it will update commit index - the new entry is regarded as committed
in raft log. However, this commit will not be notified to followers
(including the one initiated the request) until next heartbeat (ping
timeout), if no other pending requests. This results in long latency
for updates made through followers, especially when a batch of updates
are requested through the same follower.

$ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done

real    0m34.154s
user    0m0.083s
sys 0m0.250s

This patch solves the problem by sending heartbeat as soon as the commit
index is updated in leader. It also avoids unnessary heartbeat by resetting
the ping timer whenever AppendRequest is broadcasted. With this patch
the performance is improved more than 50 times in same test:

$ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done

real    0m0.564s
user    0m0.080s
sys 0m0.199s

Torture test cases are also updated because otherwise the tests will
all be skipped because of the improved performance.

Signed-off-by: Han Zhou <hzhou8@ebay.com>
---

Notes:
    v3->v4: Update torture tests again. Instead of sleeping, the size of
    transaction of each client is increased to slow down the execution so that the
    chance of parallel executions are not reduced.

 ovsdb/raft.c           | 43 +++++++++++++++++++++++++++++--------------
 tests/ovsdb-cluster.at | 16 +++++++++++-----
 2 files changed, 40 insertions(+), 19 deletions(-)

Comments

Ilya Maximets March 20, 2019, 12:02 p.m. UTC | #1
On 20.03.2019 8:56, Han Zhou wrote:
> From: Han Zhou <hzhou8@ebay.com>
> 
> When update is requested from follower, the leader sends AppendRequest
> to all followers and wait until AppendReply received from majority, and
> then it will update commit index - the new entry is regarded as committed
> in raft log. However, this commit will not be notified to followers
> (including the one initiated the request) until next heartbeat (ping
> timeout), if no other pending requests. This results in long latency
> for updates made through followers, especially when a batch of updates
> are requested through the same follower.
> 
> $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> 
> real    0m34.154s
> user    0m0.083s
> sys 0m0.250s
> 
> This patch solves the problem by sending heartbeat as soon as the commit
> index is updated in leader. It also avoids unnessary heartbeat by resetting
> the ping timer whenever AppendRequest is broadcasted. With this patch
> the performance is improved more than 50 times in same test:
> 
> $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> 
> real    0m0.564s
> user    0m0.080s
> sys 0m0.199s
> 
> Torture test cases are also updated because otherwise the tests will
> all be skipped because of the improved performance.
> 
> Signed-off-by: Han Zhou <hzhou8@ebay.com>
> ---
> 
> Notes:
>     v3->v4: Update torture tests again. Instead of sleeping, the size of
>     transaction of each client is increased to slow down the execution so that the
>     chance of parallel executions are not reduced.
> 

Unfortunately, this patch fails all the testsuite runs on TravisCI:

  https://travis-ci.org/ovsrobot/ovs/builds/508777615

And some on CirrusCI too:

  https://cirrus-ci.com/build/5201766546145280

Best regards, Ilya Maximets.

>  ovsdb/raft.c           | 43 +++++++++++++++++++++++++++++--------------
>  tests/ovsdb-cluster.at | 16 +++++++++++-----
>  2 files changed, 40 insertions(+), 19 deletions(-)
> 
> diff --git a/ovsdb/raft.c b/ovsdb/raft.c
> index eee4f33..31e9e72 100644
> --- a/ovsdb/raft.c
> +++ b/ovsdb/raft.c
> @@ -320,7 +320,8 @@ static void raft_send_append_request(struct raft *,
>  
>  static void raft_become_leader(struct raft *);
>  static void raft_become_follower(struct raft *);
> -static void raft_reset_timer(struct raft *);
> +static void raft_reset_election_timer(struct raft *);
> +static void raft_reset_ping_timer(struct raft *);
>  static void raft_send_heartbeats(struct raft *);
>  static void raft_start_election(struct raft *, bool leadership_transfer);
>  static bool raft_truncate(struct raft *, uint64_t new_end);
> @@ -376,8 +377,8 @@ raft_alloc(void)
>      hmap_init(&raft->add_servers);
>      hmap_init(&raft->commands);
>  
> -    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
> -    raft_reset_timer(raft);
> +    raft_reset_ping_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      return raft;
>  }
> @@ -865,7 +866,7 @@ raft_read_log(struct raft *raft)
>  }
>  
>  static void
> -raft_reset_timer(struct raft *raft)
> +raft_reset_election_timer(struct raft *raft)
>  {
>      unsigned int duration = (ELECTION_BASE_MSEC
>                               + random_range(ELECTION_RANGE_MSEC));
> @@ -874,6 +875,12 @@ raft_reset_timer(struct raft *raft)
>  }
>  
>  static void
> +raft_reset_ping_timer(struct raft *raft)
> +{
> +    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
> +}
> +
> +static void
>  raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
>                const struct uuid *sid, bool incoming)
>  {
> @@ -1603,7 +1610,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
>              VLOG_INFO("term %"PRIu64": starting election", raft->term);
>          }
>      }
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      struct raft_server *peer;
>      HMAP_FOR_EACH (peer, hmap_node, &raft->servers) {
> @@ -1782,8 +1789,8 @@ raft_run(struct raft *raft)
>                      raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
>                  }
>              }
> +            raft_reset_ping_timer(raft);
>          }
> -        raft->ping_timeout = time_msec() + PING_TIME_MSEC;
>      }
>  
>      /* Do this only at the end; if we did it as soon as we set raft->left or
> @@ -1963,6 +1970,7 @@ raft_command_initiate(struct raft *raft,
>              s->next_index++;
>          }
>      }
> +    raft_reset_ping_timer(raft);
>  
>      return cmd;
>  }
> @@ -2313,7 +2321,7 @@ raft_become_follower(struct raft *raft)
>      }
>  
>      raft->role = RAFT_FOLLOWER;
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      /* Notify clients about lost leadership.
>       *
> @@ -2387,6 +2395,8 @@ raft_send_heartbeats(struct raft *raft)
>                                              RAFT_CMD_INCOMPLETE, 0);
>          }
>      }
> +
> +    raft_reset_ping_timer(raft);
>  }
>  
>  /* Initializes the fields in 's' that represent the leader's view of the
> @@ -2413,7 +2423,7 @@ raft_become_leader(struct raft *raft)
>      raft->role = RAFT_LEADER;
>      raft->leader_sid = raft->sid;
>      raft->election_timeout = LLONG_MAX;
> -    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
> +    raft_reset_ping_timer(raft);
>  
>      struct raft_server *s;
>      HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
> @@ -2573,11 +2583,13 @@ raft_get_next_entry(struct raft *raft, struct uuid *eid)
>      return data;
>  }
>  
> -static void
> +/* Updates commit index in raft log. If commit index is already up-to-date
> + * it does nothing and return false, otherwise, returns true. */
> +static bool
>  raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
>  {
>      if (new_commit_index <= raft->commit_index) {
> -        return;
> +        return false;
>      }
>  
>      if (raft->role == RAFT_LEADER) {
> @@ -2610,6 +2622,7 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
>          .commit_index = raft->commit_index,
>      };
>      ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
> +    return true;
>  }
>  
>  /* This doesn't use rq->entries (but it does use rq->n_entries). */
> @@ -2809,7 +2822,7 @@ raft_handle_append_request(struct raft *raft,
>                                 "usurped leadership");
>          return;
>      }
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      /* First check for the common case, where the AppendEntries request is
>       * entirely for indexes covered by 'log_start' ... 'log_end - 1', something
> @@ -3045,7 +3058,9 @@ raft_consider_updating_commit_index(struct raft *raft)
>              }
>          }
>      }
> -    raft_update_commit_index(raft, new_commit_index);
> +    if (raft_update_commit_index(raft, new_commit_index)) {
> +        raft_send_heartbeats(raft);
> +    }
>  }
>  
>  static void
> @@ -3274,7 +3289,7 @@ raft_handle_vote_request__(struct raft *raft,
>          return false;
>      }
>  
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      return true;
>  }
> @@ -3697,7 +3712,7 @@ static bool
>  raft_handle_install_snapshot_request__(
>      struct raft *raft, const struct raft_install_snapshot_request *rq)
>  {
> -    raft_reset_timer(raft);
> +    raft_reset_election_timer(raft);
>  
>      /*
>       * Our behavior here depend on new_log_start in the snapshot compared to
> diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
> index c7f1e34..5550a19 100644
> --- a/tests/ovsdb-cluster.at
> +++ b/tests/ovsdb-cluster.at
> @@ -131,12 +131,16 @@ ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral
>      done
>      export OVN_SB_DB
>  
> -    n1=10 n2=5
> +    n1=10 n2=5 n3=50
>      echo "starting $n1*$n2 ovn-sbctl processes..."
>      for i in $(seq 0 $(expr $n1 - 1) ); do
>          (for j in $(seq $n2); do
>               : > $i-$j.running
> -             run_as "ovn-sbctl($i-$j)" ovn-sbctl "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 --no-leader-only add SB_Global . external_ids $i-$j=$i-$j
> +             txn="add SB_Global . external_ids $i-$j=$i-$j"
> +             for k in $(seq $n3); do
> +                 txn="$txn -- add SB_Global . external_ids $i-$j-$k=$i-$j-$k"
> +             done
> +             run_as "ovn-sbctl($i-$j)" ovn-sbctl "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 --no-leader-only $txn
>               status=$?
>               if test $status != 0; then
>                   echo "$i-$j exited with status $status" > $i-$j:$status
> @@ -146,13 +150,12 @@ ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral
>           : > $i.done)&
>      done
>      echo "...done"
> -    sleep 2
>  
>      echo "waiting for ovn-sbctl processes to exit..."
>      # Use file instead of var because code inside "while" runs in a subshell.
>      echo 0 > phase
>      i=0
> -    (while :; do echo; sleep 1; done) | while read REPLY; do
> +    (while :; do echo; sleep 0.1; done) | while read REPLY; do
>          printf "t=%2d s:" $i
>          done=0
>          for j in $(seq 0 $(expr $n1 - 1)); do
> @@ -168,7 +171,7 @@ ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral
>  
>          case $(cat phase) in # (
>          0)
> -            if test $done -ge $(expr $n1 / 4); then
> +            if test $done -ge $(expr $n1 / 10); then
>                  if test $variant = kill; then
>                      stop_server $victim
>                  else
> @@ -199,6 +202,9 @@ ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral
>      for i in $(seq 0 $(expr $n1 - 1) ); do
>          for j in `seq $n2`; do
>              echo "$i-$j=$i-$j"
> +            for k in `seq $n3`; do
> +                echo "$i-$j-$k=$i-$j-$k"
> +            done
>          done
>      done | sort > expout
>      AT_CHECK([ovn-sbctl --timeout=30 --log-file=finalize.log -vtimeval:off -vfile -vsyslog:off --bare get SB_Global . external-ids | tr ',' '\n' | sed 's/[[{}"" ]]//g' | sort], [0], [expout])
>
Han Zhou March 20, 2019, 3:28 p.m. UTC | #2
On Wed, Mar 20, 2019 at 5:02 AM Ilya Maximets <i.maximets@samsung.com> wrote:
>
> On 20.03.2019 8:56, Han Zhou wrote:
> > From: Han Zhou <hzhou8@ebay.com>
> >
> > When update is requested from follower, the leader sends AppendRequest
> > to all followers and wait until AppendReply received from majority, and
> > then it will update commit index - the new entry is regarded as committed
> > in raft log. However, this commit will not be notified to followers
> > (including the one initiated the request) until next heartbeat (ping
> > timeout), if no other pending requests. This results in long latency
> > for updates made through followers, especially when a batch of updates
> > are requested through the same follower.
> >
> > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> >
> > real    0m34.154s
> > user    0m0.083s
> > sys 0m0.250s
> >
> > This patch solves the problem by sending heartbeat as soon as the commit
> > index is updated in leader. It also avoids unnessary heartbeat by resetting
> > the ping timer whenever AppendRequest is broadcasted. With this patch
> > the performance is improved more than 50 times in same test:
> >
> > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> >
> > real    0m0.564s
> > user    0m0.080s
> > sys 0m0.199s
> >
> > Torture test cases are also updated because otherwise the tests will
> > all be skipped because of the improved performance.
> >
> > Signed-off-by: Han Zhou <hzhou8@ebay.com>
> > ---
> >
> > Notes:
> >     v3->v4: Update torture tests again. Instead of sleeping, the size of
> >     transaction of each client is increased to slow down the execution so that the
> >     chance of parallel executions are not reduced.
> >
>
> Unfortunately, this patch fails all the testsuite runs on TravisCI:
>
>   https://travis-ci.org/ovsrobot/ovs/builds/508777615
>
> And some on CirrusCI too:
>
>   https://cirrus-ci.com/build/5201766546145280
>
> Best regards, Ilya Maximets.
>

Does the CI retry failed tests? The failed ones are some of the
torture tests in ovsdb-cluster.at, which was discussed here:
https://mail.openvswitch.org/pipermail/ovs-dev/2019-March/357373.html

Basically, the failures are real bugs that are not caused by this
patch code itself, but triggered by the test case change in this
patch.

The test cases are improved in this patch so that can now find the bug
that was not found before. To avoid CI failure, we can either merge V3
(the tests were less effective), or wait until the bug is fixed.
Ben Pfaff March 20, 2019, 4:56 p.m. UTC | #3
On Wed, Mar 20, 2019 at 08:28:50AM -0700, Han Zhou wrote:
> On Wed, Mar 20, 2019 at 5:02 AM Ilya Maximets <i.maximets@samsung.com> wrote:
> >
> > On 20.03.2019 8:56, Han Zhou wrote:
> > > From: Han Zhou <hzhou8@ebay.com>
> > >
> > > When update is requested from follower, the leader sends AppendRequest
> > > to all followers and wait until AppendReply received from majority, and
> > > then it will update commit index - the new entry is regarded as committed
> > > in raft log. However, this commit will not be notified to followers
> > > (including the one initiated the request) until next heartbeat (ping
> > > timeout), if no other pending requests. This results in long latency
> > > for updates made through followers, especially when a batch of updates
> > > are requested through the same follower.
> > >
> > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> > >
> > > real    0m34.154s
> > > user    0m0.083s
> > > sys 0m0.250s
> > >
> > > This patch solves the problem by sending heartbeat as soon as the commit
> > > index is updated in leader. It also avoids unnessary heartbeat by resetting
> > > the ping timer whenever AppendRequest is broadcasted. With this patch
> > > the performance is improved more than 50 times in same test:
> > >
> > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> > >
> > > real    0m0.564s
> > > user    0m0.080s
> > > sys 0m0.199s
> > >
> > > Torture test cases are also updated because otherwise the tests will
> > > all be skipped because of the improved performance.
> > >
> > > Signed-off-by: Han Zhou <hzhou8@ebay.com>
> > > ---
> > >
> > > Notes:
> > >     v3->v4: Update torture tests again. Instead of sleeping, the size of
> > >     transaction of each client is increased to slow down the execution so that the
> > >     chance of parallel executions are not reduced.
> > >
> >
> > Unfortunately, this patch fails all the testsuite runs on TravisCI:
> >
> >   https://travis-ci.org/ovsrobot/ovs/builds/508777615
> >
> > And some on CirrusCI too:
> >
> >   https://cirrus-ci.com/build/5201766546145280
> >
> > Best regards, Ilya Maximets.
> >
> 
> Does the CI retry failed tests? The failed ones are some of the
> torture tests in ovsdb-cluster.at, which was discussed here:
> https://mail.openvswitch.org/pipermail/ovs-dev/2019-March/357373.html
> 
> Basically, the failures are real bugs that are not caused by this
> patch code itself, but triggered by the test case change in this
> patch.
> 
> The test cases are improved in this patch so that can now find the bug
> that was not found before. To avoid CI failure, we can either merge V3
> (the tests were less effective), or wait until the bug is fixed.

Both of these do retry failed tests.  You can see the details from the
logs at the URLs that Ilya cited.
Han Zhou March 20, 2019, 5:44 p.m. UTC | #4
On Wed, Mar 20, 2019 at 9:56 AM Ben Pfaff <blp@ovn.org> wrote:
>
> On Wed, Mar 20, 2019 at 08:28:50AM -0700, Han Zhou wrote:
> > On Wed, Mar 20, 2019 at 5:02 AM Ilya Maximets <i.maximets@samsung.com> wrote:
> > >
> > > On 20.03.2019 8:56, Han Zhou wrote:
> > > > From: Han Zhou <hzhou8@ebay.com>
> > > >
> > > > When update is requested from follower, the leader sends AppendRequest
> > > > to all followers and wait until AppendReply received from majority, and
> > > > then it will update commit index - the new entry is regarded as committed
> > > > in raft log. However, this commit will not be notified to followers
> > > > (including the one initiated the request) until next heartbeat (ping
> > > > timeout), if no other pending requests. This results in long latency
> > > > for updates made through followers, especially when a batch of updates
> > > > are requested through the same follower.
> > > >
> > > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> > > >
> > > > real    0m34.154s
> > > > user    0m0.083s
> > > > sys 0m0.250s
> > > >
> > > > This patch solves the problem by sending heartbeat as soon as the commit
> > > > index is updated in leader. It also avoids unnessary heartbeat by resetting
> > > > the ping timer whenever AppendRequest is broadcasted. With this patch
> > > > the performance is improved more than 50 times in same test:
> > > >
> > > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> > > >
> > > > real    0m0.564s
> > > > user    0m0.080s
> > > > sys 0m0.199s
> > > >
> > > > Torture test cases are also updated because otherwise the tests will
> > > > all be skipped because of the improved performance.
> > > >
> > > > Signed-off-by: Han Zhou <hzhou8@ebay.com>
> > > > ---
> > > >
> > > > Notes:
> > > >     v3->v4: Update torture tests again. Instead of sleeping, the size of
> > > >     transaction of each client is increased to slow down the execution so that the
> > > >     chance of parallel executions are not reduced.
> > > >
> > >
> > > Unfortunately, this patch fails all the testsuite runs on TravisCI:
> > >
> > >   https://travis-ci.org/ovsrobot/ovs/builds/508777615
> > >
> > > And some on CirrusCI too:
> > >
> > >   https://cirrus-ci.com/build/5201766546145280
> > >
> > > Best regards, Ilya Maximets.
> > >
> >
> > Does the CI retry failed tests? The failed ones are some of the
> > torture tests in ovsdb-cluster.at, which was discussed here:
> > https://mail.openvswitch.org/pipermail/ovs-dev/2019-March/357373.html
> >
> > Basically, the failures are real bugs that are not caused by this
> > patch code itself, but triggered by the test case change in this
> > patch.
> >
> > The test cases are improved in this patch so that can now find the bug
> > that was not found before. To avoid CI failure, we can either merge V3
> > (the tests were less effective), or wait until the bug is fixed.
>
> Both of these do retry failed tests.  You can see the details from the
> logs at the URLs that Ilya cited.

Yes, checking the log again, I saw the failed torture tests are
retried once, and some of them failed again when retrying, which make
me more confident for the effectiveness of the updated test cases. I
may be distracted today but I will continue debugging tomorrow. I am
pretty confident that the bug is not related to the current patch,
because it is easy to reproduce the failures such as test 2528 and
2533 with current master applying only the torture test case change.
Ben Pfaff March 20, 2019, 6:07 p.m. UTC | #5
On Wed, Mar 20, 2019 at 10:44:55AM -0700, Han Zhou wrote:
> On Wed, Mar 20, 2019 at 9:56 AM Ben Pfaff <blp@ovn.org> wrote:
> >
> > On Wed, Mar 20, 2019 at 08:28:50AM -0700, Han Zhou wrote:
> > > On Wed, Mar 20, 2019 at 5:02 AM Ilya Maximets <i.maximets@samsung.com> wrote:
> > > >
> > > > On 20.03.2019 8:56, Han Zhou wrote:
> > > > > From: Han Zhou <hzhou8@ebay.com>
> > > > >
> > > > > When update is requested from follower, the leader sends AppendRequest
> > > > > to all followers and wait until AppendReply received from majority, and
> > > > > then it will update commit index - the new entry is regarded as committed
> > > > > in raft log. However, this commit will not be notified to followers
> > > > > (including the one initiated the request) until next heartbeat (ping
> > > > > timeout), if no other pending requests. This results in long latency
> > > > > for updates made through followers, especially when a batch of updates
> > > > > are requested through the same follower.
> > > > >
> > > > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> > > > >
> > > > > real    0m34.154s
> > > > > user    0m0.083s
> > > > > sys 0m0.250s
> > > > >
> > > > > This patch solves the problem by sending heartbeat as soon as the commit
> > > > > index is updated in leader. It also avoids unnessary heartbeat by resetting
> > > > > the ping timer whenever AppendRequest is broadcasted. With this patch
> > > > > the performance is improved more than 50 times in same test:
> > > > >
> > > > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> > > > >
> > > > > real    0m0.564s
> > > > > user    0m0.080s
> > > > > sys 0m0.199s
> > > > >
> > > > > Torture test cases are also updated because otherwise the tests will
> > > > > all be skipped because of the improved performance.
> > > > >
> > > > > Signed-off-by: Han Zhou <hzhou8@ebay.com>
> > > > > ---
> > > > >
> > > > > Notes:
> > > > >     v3->v4: Update torture tests again. Instead of sleeping, the size of
> > > > >     transaction of each client is increased to slow down the execution so that the
> > > > >     chance of parallel executions are not reduced.
> > > > >
> > > >
> > > > Unfortunately, this patch fails all the testsuite runs on TravisCI:
> > > >
> > > >   https://travis-ci.org/ovsrobot/ovs/builds/508777615
> > > >
> > > > And some on CirrusCI too:
> > > >
> > > >   https://cirrus-ci.com/build/5201766546145280
> > > >
> > > > Best regards, Ilya Maximets.
> > > >
> > >
> > > Does the CI retry failed tests? The failed ones are some of the
> > > torture tests in ovsdb-cluster.at, which was discussed here:
> > > https://mail.openvswitch.org/pipermail/ovs-dev/2019-March/357373.html
> > >
> > > Basically, the failures are real bugs that are not caused by this
> > > patch code itself, but triggered by the test case change in this
> > > patch.
> > >
> > > The test cases are improved in this patch so that can now find the bug
> > > that was not found before. To avoid CI failure, we can either merge V3
> > > (the tests were less effective), or wait until the bug is fixed.
> >
> > Both of these do retry failed tests.  You can see the details from the
> > logs at the URLs that Ilya cited.
> 
> Yes, checking the log again, I saw the failed torture tests are
> retried once, and some of them failed again when retrying, which make
> me more confident for the effectiveness of the updated test cases. I
> may be distracted today but I will continue debugging tomorrow. I am
> pretty confident that the bug is not related to the current patch,
> because it is easy to reproduce the failures such as test 2528 and
> 2533 with current master applying only the torture test case change.

By the way, I totally support this effort and I'm really looking forward
to applying the fixes when we figure out how to make the tests both
effective and pass in the normal case.
Han Zhou March 21, 2019, 5:03 p.m. UTC | #6
On Wed, Mar 20, 2019 at 11:07 AM Ben Pfaff <blp@ovn.org> wrote:
>
> On Wed, Mar 20, 2019 at 10:44:55AM -0700, Han Zhou wrote:
> > On Wed, Mar 20, 2019 at 9:56 AM Ben Pfaff <blp@ovn.org> wrote:
> > >
> > > On Wed, Mar 20, 2019 at 08:28:50AM -0700, Han Zhou wrote:
> > > > On Wed, Mar 20, 2019 at 5:02 AM Ilya Maximets <i.maximets@samsung.com> wrote:
> > > > >
> > > > > On 20.03.2019 8:56, Han Zhou wrote:
> > > > > > From: Han Zhou <hzhou8@ebay.com>
> > > > > >
> > > > > > When update is requested from follower, the leader sends AppendRequest
> > > > > > to all followers and wait until AppendReply received from majority, and
> > > > > > then it will update commit index - the new entry is regarded as committed
> > > > > > in raft log. However, this commit will not be notified to followers
> > > > > > (including the one initiated the request) until next heartbeat (ping
> > > > > > timeout), if no other pending requests. This results in long latency
> > > > > > for updates made through followers, especially when a batch of updates
> > > > > > are requested through the same follower.
> > > > > >
> > > > > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> > > > > >
> > > > > > real    0m34.154s
> > > > > > user    0m0.083s
> > > > > > sys 0m0.250s
> > > > > >
> > > > > > This patch solves the problem by sending heartbeat as soon as the commit
> > > > > > index is updated in leader. It also avoids unnessary heartbeat by resetting
> > > > > > the ping timer whenever AppendRequest is broadcasted. With this patch
> > > > > > the performance is improved more than 50 times in same test:
> > > > > >
> > > > > > $ time for i in `seq 1 100`; do ovn-nbctl ls-add ls$i; done
> > > > > >
> > > > > > real    0m0.564s
> > > > > > user    0m0.080s
> > > > > > sys 0m0.199s
> > > > > >
> > > > > > Torture test cases are also updated because otherwise the tests will
> > > > > > all be skipped because of the improved performance.
> > > > > >
> > > > > > Signed-off-by: Han Zhou <hzhou8@ebay.com>
> > > > > > ---
> > > > > >
> > > > > > Notes:
> > > > > >     v3->v4: Update torture tests again. Instead of sleeping, the size of
> > > > > >     transaction of each client is increased to slow down the execution so that the
> > > > > >     chance of parallel executions are not reduced.
> > > > > >
> > > > >
> > > > > Unfortunately, this patch fails all the testsuite runs on TravisCI:
> > > > >
> > > > >   https://travis-ci.org/ovsrobot/ovs/builds/508777615
> > > > >
> > > > > And some on CirrusCI too:
> > > > >
> > > > >   https://cirrus-ci.com/build/5201766546145280
> > > > >
> > > > > Best regards, Ilya Maximets.
> > > > >
> > > >
> > > > Does the CI retry failed tests? The failed ones are some of the
> > > > torture tests in ovsdb-cluster.at, which was discussed here:
> > > > https://mail.openvswitch.org/pipermail/ovs-dev/2019-March/357373.html
> > > >
> > > > Basically, the failures are real bugs that are not caused by this
> > > > patch code itself, but triggered by the test case change in this
> > > > patch.
> > > >
> > > > The test cases are improved in this patch so that can now find the bug
> > > > that was not found before. To avoid CI failure, we can either merge V3
> > > > (the tests were less effective), or wait until the bug is fixed.
> > >
> > > Both of these do retry failed tests.  You can see the details from the
> > > logs at the URLs that Ilya cited.
> >
> > Yes, checking the log again, I saw the failed torture tests are
> > retried once, and some of them failed again when retrying, which make
> > me more confident for the effectiveness of the updated test cases. I
> > may be distracted today but I will continue debugging tomorrow. I am
> > pretty confident that the bug is not related to the current patch,
> > because it is easy to reproduce the failures such as test 2528 and
> > 2533 with current master applying only the torture test case change.
>
> By the way, I totally support this effort and I'm really looking forward
> to applying the fixes when we figure out how to make the tests both
> effective and pass in the normal case.

Hi Ben, I fixed a reconnection bug which was causing the client IDL
reconnect to same old server after the server is disconnected from
cluster in the torture test:
https://mail.openvswitch.org/pipermail/ovs-dev/2019-March/357443.html
When server was disconnected, it sent out monitor_cancelled message to
client, so the reconnect FSM transitioned back to ACTIVE with same old
server because of the activity on the session. After the fix, the
torture tests all passed in several runs on my laptop with current
patch V4. Please try it and let me know.

Thanks,
Han
diff mbox series

Patch

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index eee4f33..31e9e72 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -320,7 +320,8 @@  static void raft_send_append_request(struct raft *,
 
 static void raft_become_leader(struct raft *);
 static void raft_become_follower(struct raft *);
-static void raft_reset_timer(struct raft *);
+static void raft_reset_election_timer(struct raft *);
+static void raft_reset_ping_timer(struct raft *);
 static void raft_send_heartbeats(struct raft *);
 static void raft_start_election(struct raft *, bool leadership_transfer);
 static bool raft_truncate(struct raft *, uint64_t new_end);
@@ -376,8 +377,8 @@  raft_alloc(void)
     hmap_init(&raft->add_servers);
     hmap_init(&raft->commands);
 
-    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
-    raft_reset_timer(raft);
+    raft_reset_ping_timer(raft);
+    raft_reset_election_timer(raft);
 
     return raft;
 }
@@ -865,7 +866,7 @@  raft_read_log(struct raft *raft)
 }
 
 static void
-raft_reset_timer(struct raft *raft)
+raft_reset_election_timer(struct raft *raft)
 {
     unsigned int duration = (ELECTION_BASE_MSEC
                              + random_range(ELECTION_RANGE_MSEC));
@@ -874,6 +875,12 @@  raft_reset_timer(struct raft *raft)
 }
 
 static void
+raft_reset_ping_timer(struct raft *raft)
+{
+    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
+}
+
+static void
 raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
               const struct uuid *sid, bool incoming)
 {
@@ -1603,7 +1610,7 @@  raft_start_election(struct raft *raft, bool leadership_transfer)
             VLOG_INFO("term %"PRIu64": starting election", raft->term);
         }
     }
-    raft_reset_timer(raft);
+    raft_reset_election_timer(raft);
 
     struct raft_server *peer;
     HMAP_FOR_EACH (peer, hmap_node, &raft->servers) {
@@ -1782,8 +1789,8 @@  raft_run(struct raft *raft)
                     raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
                 }
             }
+            raft_reset_ping_timer(raft);
         }
-        raft->ping_timeout = time_msec() + PING_TIME_MSEC;
     }
 
     /* Do this only at the end; if we did it as soon as we set raft->left or
@@ -1963,6 +1970,7 @@  raft_command_initiate(struct raft *raft,
             s->next_index++;
         }
     }
+    raft_reset_ping_timer(raft);
 
     return cmd;
 }
@@ -2313,7 +2321,7 @@  raft_become_follower(struct raft *raft)
     }
 
     raft->role = RAFT_FOLLOWER;
-    raft_reset_timer(raft);
+    raft_reset_election_timer(raft);
 
     /* Notify clients about lost leadership.
      *
@@ -2387,6 +2395,8 @@  raft_send_heartbeats(struct raft *raft)
                                             RAFT_CMD_INCOMPLETE, 0);
         }
     }
+
+    raft_reset_ping_timer(raft);
 }
 
 /* Initializes the fields in 's' that represent the leader's view of the
@@ -2413,7 +2423,7 @@  raft_become_leader(struct raft *raft)
     raft->role = RAFT_LEADER;
     raft->leader_sid = raft->sid;
     raft->election_timeout = LLONG_MAX;
-    raft->ping_timeout = time_msec() + PING_TIME_MSEC;
+    raft_reset_ping_timer(raft);
 
     struct raft_server *s;
     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
@@ -2573,11 +2583,13 @@  raft_get_next_entry(struct raft *raft, struct uuid *eid)
     return data;
 }
 
-static void
+/* Updates commit index in raft log. If commit index is already up-to-date
+ * it does nothing and return false, otherwise, returns true. */
+static bool
 raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
 {
     if (new_commit_index <= raft->commit_index) {
-        return;
+        return false;
     }
 
     if (raft->role == RAFT_LEADER) {
@@ -2610,6 +2622,7 @@  raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
         .commit_index = raft->commit_index,
     };
     ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
+    return true;
 }
 
 /* This doesn't use rq->entries (but it does use rq->n_entries). */
@@ -2809,7 +2822,7 @@  raft_handle_append_request(struct raft *raft,
                                "usurped leadership");
         return;
     }
-    raft_reset_timer(raft);
+    raft_reset_election_timer(raft);
 
     /* First check for the common case, where the AppendEntries request is
      * entirely for indexes covered by 'log_start' ... 'log_end - 1', something
@@ -3045,7 +3058,9 @@  raft_consider_updating_commit_index(struct raft *raft)
             }
         }
     }
-    raft_update_commit_index(raft, new_commit_index);
+    if (raft_update_commit_index(raft, new_commit_index)) {
+        raft_send_heartbeats(raft);
+    }
 }
 
 static void
@@ -3274,7 +3289,7 @@  raft_handle_vote_request__(struct raft *raft,
         return false;
     }
 
-    raft_reset_timer(raft);
+    raft_reset_election_timer(raft);
 
     return true;
 }
@@ -3697,7 +3712,7 @@  static bool
 raft_handle_install_snapshot_request__(
     struct raft *raft, const struct raft_install_snapshot_request *rq)
 {
-    raft_reset_timer(raft);
+    raft_reset_election_timer(raft);
 
     /*
      * Our behavior here depend on new_log_start in the snapshot compared to
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index c7f1e34..5550a19 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -131,12 +131,16 @@  ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral
     done
     export OVN_SB_DB
 
-    n1=10 n2=5
+    n1=10 n2=5 n3=50
     echo "starting $n1*$n2 ovn-sbctl processes..."
     for i in $(seq 0 $(expr $n1 - 1) ); do
         (for j in $(seq $n2); do
              : > $i-$j.running
-             run_as "ovn-sbctl($i-$j)" ovn-sbctl "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 --no-leader-only add SB_Global . external_ids $i-$j=$i-$j
+             txn="add SB_Global . external_ids $i-$j=$i-$j"
+             for k in $(seq $n3); do
+                 txn="$txn -- add SB_Global . external_ids $i-$j-$k=$i-$j-$k"
+             done
+             run_as "ovn-sbctl($i-$j)" ovn-sbctl "-vPATTERN:console:ovn-sbctl($i-$j)|%D{%H:%M:%S}|%05N|%c|%p|%m" --log-file=$i-$j.log -vfile -vsyslog:off -vtimeval:off --timeout=120 --no-leader-only $txn
              status=$?
              if test $status != 0; then
                  echo "$i-$j exited with status $status" > $i-$j:$status
@@ -146,13 +150,12 @@  ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral
          : > $i.done)&
     done
     echo "...done"
-    sleep 2
 
     echo "waiting for ovn-sbctl processes to exit..."
     # Use file instead of var because code inside "while" runs in a subshell.
     echo 0 > phase
     i=0
-    (while :; do echo; sleep 1; done) | while read REPLY; do
+    (while :; do echo; sleep 0.1; done) | while read REPLY; do
         printf "t=%2d s:" $i
         done=0
         for j in $(seq 0 $(expr $n1 - 1)); do
@@ -168,7 +171,7 @@  ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral
 
         case $(cat phase) in # (
         0)
-            if test $done -ge $(expr $n1 / 4); then
+            if test $done -ge $(expr $n1 / 10); then
                 if test $variant = kill; then
                     stop_server $victim
                 else
@@ -199,6 +202,9 @@  ovsdb|WARN|schema: changed 2 columns in 'OVN_Southbound' database from ephemeral
     for i in $(seq 0 $(expr $n1 - 1) ); do
         for j in `seq $n2`; do
             echo "$i-$j=$i-$j"
+            for k in `seq $n3`; do
+                echo "$i-$j-$k=$i-$j-$k"
+            done
         done
     done | sort > expout
     AT_CHECK([ovn-sbctl --timeout=30 --log-file=finalize.log -vtimeval:off -vfile -vsyslog:off --bare get SB_Global . external-ids | tr ',' '\n' | sed 's/[[{}"" ]]//g' | sort], [0], [expout])