diff mbox series

[ovs-dev] ovn-controller: Unix command debug/delay-nb-cfg-report.

Message ID 1599595835-124391-1-git-send-email-hzhou@ovn.org
State Accepted
Headers show
Series [ovs-dev] ovn-controller: Unix command debug/delay-nb-cfg-report. | expand

Commit Message

Han Zhou Sept. 8, 2020, 8:10 p.m. UTC
This command is added to avoid the flooding of nb_cfg updates from
a large nubmer of hypervisors interfering with the original SB DB
data distribution and handling, so that the e2e control plane latency
can be measured more accurately during scale testing with the command
    ovn-nbctl --print-wait-time --wait=hv ...

Without this, there can be hypervisors updating back to SB before the
SB server finishes sending notifications to the rest of hypervisors.
In my test with 3K HVs, delaying the report for 2 seconds is enough to
get accurate latency report.

Signed-off-by: Han Zhou <hzhou@ovn.org>
---
This patch is a revision of the last patch in a previous patch series to
address comments from Numan. The other patches in that series have been
merged, so this patch is sent as a new patch.

 controller/ovn-controller.8.xml |  9 +++++++++
 controller/ovn-controller.c     | 30 ++++++++++++++++++++++++++++++
 tests/ovn-controller.at         | 30 ++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

Comments

Numan Siddique Sept. 10, 2020, 3:38 p.m. UTC | #1
On Wed, Sep 9, 2020 at 1:41 AM Han Zhou <hzhou@ovn.org> wrote:
>
> This command is added to avoid the flooding of nb_cfg updates from
> a large nubmer of hypervisors interfering with the original SB DB
> data distribution and handling, so that the e2e control plane latency
> can be measured more accurately during scale testing with the command
>     ovn-nbctl --print-wait-time --wait=hv ...
>
> Without this, there can be hypervisors updating back to SB before the
> SB server finishes sending notifications to the rest of hypervisors.
> In my test with 3K HVs, delaying the report for 2 seconds is enough to
> get accurate latency report.
>
> Signed-off-by: Han Zhou <hzhou@ovn.org>

Acked-by: Numan Siddique <numans@ovn.org>

Note: You have to rebase before applying.

Thanks
Numan

> ---
> This patch is a revision of the last patch in a previous patch series to
> address comments from Numan. The other patches in that series have been
> merged, so this patch is sent as a new patch.
>
>  controller/ovn-controller.8.xml |  9 +++++++++
>  controller/ovn-controller.c     | 30 ++++++++++++++++++++++++++++++
>  tests/ovn-controller.at         | 30 ++++++++++++++++++++++++++++++
>  3 files changed, 69 insertions(+)
>
> diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
> index 6687731..16bc47b 100644
> --- a/controller/ovn-controller.8.xml
> +++ b/controller/ovn-controller.8.xml
> @@ -507,6 +507,15 @@
>          local index so that it can interact with the southbound database again.
>        </p>
>        </dd>
> +
> +      <dt><code>debug/delay-nb-cfg-report</code> <var>seconds</var></dt>
> +      <dd>
> +        This command is used to delay ovn-controller updating the
> +        <code>nb_cfg</code> back to <code>OVN_Southbound</code> database.  This
> +        is useful when <code>ovn-nbctl --wait=hv</code> is used to measure
> +        end-to-end latency in a large scale environment.  See
> +        <code>ovn-nbctl</code>(8) for more details.
> +      </dd>
>        </dl>
>      </p>
>
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index 674a094..e2e9095 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c
> @@ -76,6 +76,7 @@ static unixctl_cb_func cluster_state_reset_cmd;
>  static unixctl_cb_func debug_pause_execution;
>  static unixctl_cb_func debug_resume_execution;
>  static unixctl_cb_func debug_status_execution;
> +static unixctl_cb_func debug_delay_nb_cfg_report;
>
>  #define DEFAULT_BRIDGE_NAME "br-int"
>  #define DEFAULT_PROBE_INTERVAL_MSEC 5000
> @@ -2360,6 +2361,10 @@ main(int argc, char *argv[])
>      unixctl_command_register("debug/status", "", 0, 0, debug_status_execution,
>                               &paused);
>
> +    unsigned int delay_nb_cfg_report = 0;
> +    unixctl_command_register("debug/delay-nb-cfg-report", "SECONDS", 1, 1,
> +                             debug_delay_nb_cfg_report, &delay_nb_cfg_report);
> +
>      unsigned int ovs_cond_seqno = UINT_MAX;
>      unsigned int ovnsb_cond_seqno = UINT_MAX;
>
> @@ -2571,6 +2576,10 @@ main(int argc, char *argv[])
>                      sbrec_chassis_private_set_nb_cfg(chassis_private, cur_cfg);
>                      sbrec_chassis_private_set_nb_cfg_timestamp(
>                          chassis_private, time_wall_msec());
> +                    if (delay_nb_cfg_report) {
> +                        VLOG_INFO("Sleep for %u sec", delay_nb_cfg_report);
> +                        xsleep(delay_nb_cfg_report);
> +                    }
>                  }
>              }
>
> @@ -2927,3 +2936,24 @@ debug_status_execution(struct unixctl_conn *conn, int argc OVS_UNUSED,
>          unixctl_command_reply(conn, "running");
>      }
>  }
> +
> +static void
> +debug_delay_nb_cfg_report(struct unixctl_conn *conn, int argc OVS_UNUSED,
> +                          const char *argv[], void *delay_)
> +{
> +    unsigned int *delay = delay_;
> +
> +    if (!str_to_uint(argv[1], 10, delay)) {
> +        unixctl_command_reply_error(conn, "unsigned integer required");
> +        return;
> +    }
> +
> +    char *msg;
> +    if (*delay) {
> +        msg = xasprintf("delay nb_cfg report for %u seconds.", *delay);
> +        unixctl_command_reply(conn, msg);
> +        free(msg);
> +    } else {
> +        unixctl_command_reply(conn, "no delay for nb_cfg report.");
> +    }
> +}
> diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
> index 812946b..7ed5216 100644
> --- a/tests/ovn-controller.at
> +++ b/tests/ovn-controller.at
> @@ -382,3 +382,33 @@ OVS_WAIT_UNTIL([test x0 = x`ovn-sbctl --columns nb_cfg --bare find chassis`])
>
>  OVN_CLEANUP([hv])
>  AT_CLEANUP
> +
> +# Test unix command: debug/delay-nb-cfg-report
> +AT_SETUP([ovn-controller - debug/delay-nb-cfg-report])
> +AT_KEYWORDS([ovn])
> +ovn_start
> +
> +net_add n1
> +sim_add hv
> +as hv
> +ovs-vsctl add-br br-phys
> +ovn_attach n1 br-phys 192.168.0.1
> +
> +OVS_WAIT_UNTIL([test xhv = x`ovn-sbctl --columns name --bare find chassis`])
> +
> +AT_CHECK([ovn-appctl -t ovn-controller debug/delay-nb-cfg-report 2], [0],
> +         [delay nb_cfg report for 2 seconds.
> +])
> +
> +AT_FAIL_IF([ovn-nbctl --timeout=1 --wait=hv sync])
> +
> +AT_CHECK([ovn-nbctl --timeout=3 --wait=hv sync])
> +
> +AT_CHECK([ovn-appctl -t ovn-controller debug/delay-nb-cfg-report 0], [0],
> +         [no delay for nb_cfg report.
> +])
> +
> +AT_CHECK([ovn-nbctl --timeout=1 --wait=hv sync])
> +
> +OVN_CLEANUP([hv])
> +AT_CLEANUP
> --
> 2.1.0
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Han Zhou Sept. 10, 2020, 5:05 p.m. UTC | #2
On Thu, Sep 10, 2020 at 8:39 AM Numan Siddique <numans@ovn.org> wrote:
>
> On Wed, Sep 9, 2020 at 1:41 AM Han Zhou <hzhou@ovn.org> wrote:
> >
> > This command is added to avoid the flooding of nb_cfg updates from
> > a large nubmer of hypervisors interfering with the original SB DB
> > data distribution and handling, so that the e2e control plane latency
> > can be measured more accurately during scale testing with the command
> >     ovn-nbctl --print-wait-time --wait=hv ...
> >
> > Without this, there can be hypervisors updating back to SB before the
> > SB server finishes sending notifications to the rest of hypervisors.
> > In my test with 3K HVs, delaying the report for 2 seconds is enough to
> > get accurate latency report.
> >
> > Signed-off-by: Han Zhou <hzhou@ovn.org>
>
> Acked-by: Numan Siddique <numans@ovn.org>
>
> Note: You have to rebase before applying.
>
Thanks Numan. I applied this to master.
Ilya Maximets Sept. 10, 2020, 6:04 p.m. UTC | #3
On 9/10/20 7:05 PM, Han Zhou wrote:
> On Thu, Sep 10, 2020 at 8:39 AM Numan Siddique <numans@ovn.org> wrote:
>>
>> On Wed, Sep 9, 2020 at 1:41 AM Han Zhou <hzhou@ovn.org> wrote:
>>>
>>> This command is added to avoid the flooding of nb_cfg updates from
>>> a large nubmer of hypervisors interfering with the original SB DB
>>> data distribution and handling, so that the e2e control plane latency
>>> can be measured more accurately during scale testing with the command
>>>     ovn-nbctl --print-wait-time --wait=hv ...
>>>
>>> Without this, there can be hypervisors updating back to SB before the
>>> SB server finishes sending notifications to the rest of hypervisors.
>>> In my test with 3K HVs, delaying the report for 2 seconds is enough to
>>> get accurate latency report.
>>>
>>> Signed-off-by: Han Zhou <hzhou@ovn.org>
>>
>> Acked-by: Numan Siddique <numans@ovn.org>
>>
>> Note: You have to rebase before applying.
>>
> Thanks Numan. I applied this to master.

The test in this patch is not stable and fails on travis from time to time:
  https://travis-ci.org/github/ovn-org/ovn/jobs/726006756#L5318

Probably, 3 seconds is not enough, or maybe there is a real bug somewhere.

Best regards, Ilya Maximets.
Han Zhou Sept. 10, 2020, 6:19 p.m. UTC | #4
On Thu, Sep 10, 2020 at 11:04 AM Ilya Maximets <i.maximets@ovn.org> wrote:

> On 9/10/20 7:05 PM, Han Zhou wrote:
> > On Thu, Sep 10, 2020 at 8:39 AM Numan Siddique <numans@ovn.org> wrote:
> >>
> >> On Wed, Sep 9, 2020 at 1:41 AM Han Zhou <hzhou@ovn.org> wrote:
> >>>
> >>> This command is added to avoid the flooding of nb_cfg updates from
> >>> a large nubmer of hypervisors interfering with the original SB DB
> >>> data distribution and handling, so that the e2e control plane latency
> >>> can be measured more accurately during scale testing with the command
> >>>     ovn-nbctl --print-wait-time --wait=hv ...
> >>>
> >>> Without this, there can be hypervisors updating back to SB before the
> >>> SB server finishes sending notifications to the rest of hypervisors.
> >>> In my test with 3K HVs, delaying the report for 2 seconds is enough to
> >>> get accurate latency report.
> >>>
> >>> Signed-off-by: Han Zhou <hzhou@ovn.org>
> >>
> >> Acked-by: Numan Siddique <numans@ovn.org>
> >>
> >> Note: You have to rebase before applying.
> >>
> > Thanks Numan. I applied this to master.
>
> The test in this patch is not stable and fails on travis from time to time:
>   https://travis-ci.org/github/ovn-org/ovn/jobs/726006756#L5318
>
> Probably, 3 seconds is not enough, or maybe there is a real bug somewhere.
>
> Best regards, Ilya Maximets.
>

Thanks Ilya, I just sent a fix:
https://patchwork.ozlabs.org/project/ovn/patch/1599761867-122691-1-git-send-email-hzhou@ovn.org/
Could you take a look? Sorry for the inconvenience.

Han
diff mbox series

Patch

diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
index 6687731..16bc47b 100644
--- a/controller/ovn-controller.8.xml
+++ b/controller/ovn-controller.8.xml
@@ -507,6 +507,15 @@ 
         local index so that it can interact with the southbound database again.
       </p>
       </dd>
+
+      <dt><code>debug/delay-nb-cfg-report</code> <var>seconds</var></dt>
+      <dd>
+        This command is used to delay ovn-controller updating the
+        <code>nb_cfg</code> back to <code>OVN_Southbound</code> database.  This
+        is useful when <code>ovn-nbctl --wait=hv</code> is used to measure
+        end-to-end latency in a large scale environment.  See
+        <code>ovn-nbctl</code>(8) for more details.
+      </dd>
       </dl>
     </p>
 
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 674a094..e2e9095 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -76,6 +76,7 @@  static unixctl_cb_func cluster_state_reset_cmd;
 static unixctl_cb_func debug_pause_execution;
 static unixctl_cb_func debug_resume_execution;
 static unixctl_cb_func debug_status_execution;
+static unixctl_cb_func debug_delay_nb_cfg_report;
 
 #define DEFAULT_BRIDGE_NAME "br-int"
 #define DEFAULT_PROBE_INTERVAL_MSEC 5000
@@ -2360,6 +2361,10 @@  main(int argc, char *argv[])
     unixctl_command_register("debug/status", "", 0, 0, debug_status_execution,
                              &paused);
 
+    unsigned int delay_nb_cfg_report = 0;
+    unixctl_command_register("debug/delay-nb-cfg-report", "SECONDS", 1, 1,
+                             debug_delay_nb_cfg_report, &delay_nb_cfg_report);
+
     unsigned int ovs_cond_seqno = UINT_MAX;
     unsigned int ovnsb_cond_seqno = UINT_MAX;
 
@@ -2571,6 +2576,10 @@  main(int argc, char *argv[])
                     sbrec_chassis_private_set_nb_cfg(chassis_private, cur_cfg);
                     sbrec_chassis_private_set_nb_cfg_timestamp(
                         chassis_private, time_wall_msec());
+                    if (delay_nb_cfg_report) {
+                        VLOG_INFO("Sleep for %u sec", delay_nb_cfg_report);
+                        xsleep(delay_nb_cfg_report);
+                    }
                 }
             }
 
@@ -2927,3 +2936,24 @@  debug_status_execution(struct unixctl_conn *conn, int argc OVS_UNUSED,
         unixctl_command_reply(conn, "running");
     }
 }
+
+static void
+debug_delay_nb_cfg_report(struct unixctl_conn *conn, int argc OVS_UNUSED,
+                          const char *argv[], void *delay_)
+{
+    unsigned int *delay = delay_;
+
+    if (!str_to_uint(argv[1], 10, delay)) {
+        unixctl_command_reply_error(conn, "unsigned integer required");
+        return;
+    }
+
+    char *msg;
+    if (*delay) {
+        msg = xasprintf("delay nb_cfg report for %u seconds.", *delay);
+        unixctl_command_reply(conn, msg);
+        free(msg);
+    } else {
+        unixctl_command_reply(conn, "no delay for nb_cfg report.");
+    }
+}
diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
index 812946b..7ed5216 100644
--- a/tests/ovn-controller.at
+++ b/tests/ovn-controller.at
@@ -382,3 +382,33 @@  OVS_WAIT_UNTIL([test x0 = x`ovn-sbctl --columns nb_cfg --bare find chassis`])
 
 OVN_CLEANUP([hv])
 AT_CLEANUP
+
+# Test unix command: debug/delay-nb-cfg-report
+AT_SETUP([ovn-controller - debug/delay-nb-cfg-report])
+AT_KEYWORDS([ovn])
+ovn_start
+
+net_add n1
+sim_add hv
+as hv
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+
+OVS_WAIT_UNTIL([test xhv = x`ovn-sbctl --columns name --bare find chassis`])
+
+AT_CHECK([ovn-appctl -t ovn-controller debug/delay-nb-cfg-report 2], [0],
+         [delay nb_cfg report for 2 seconds.
+])
+
+AT_FAIL_IF([ovn-nbctl --timeout=1 --wait=hv sync])
+
+AT_CHECK([ovn-nbctl --timeout=3 --wait=hv sync])
+
+AT_CHECK([ovn-appctl -t ovn-controller debug/delay-nb-cfg-report 0], [0],
+         [no delay for nb_cfg report.
+])
+
+AT_CHECK([ovn-nbctl --timeout=1 --wait=hv sync])
+
+OVN_CLEANUP([hv])
+AT_CLEANUP