diff mbox series

[ovs-dev,v2] controller: Fixed ovs/ovn(features) connection lost when running more than 120 seconds

Message ID 20221124140346.1026142-1-xsimonar@redhat.com
State Accepted
Headers show
Series [ovs-dev,v2] controller: Fixed ovs/ovn(features) connection lost when running more than 120 seconds | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Xavier Simonart Nov. 24, 2022, 2:03 p.m. UTC
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2144084

Signed-off-by: Xavier Simonart <xsimonar@redhat.com>

---
v2: - Based on Dumitru's feedback:
      - Reduce test case length by removing uggly sleep
      - Add an explicit probe for the feature connection
    - Rebased on main
---
 lib/features.c | 22 +++++++++++++++-------
 tests/ovn.at   | 31 +++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 7 deletions(-)

Comments

Ales Musil Nov. 24, 2022, 2:24 p.m. UTC | #1
On Thu, Nov 24, 2022 at 3:03 PM Xavier Simonart <xsimonar@redhat.com> wrote:

> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2144084
>
> Signed-off-by: Xavier Simonart <xsimonar@redhat.com>
>
> ---
> v2: - Based on Dumitru's feedback:
>       - Reduce test case length by removing uggly sleep
>       - Add an explicit probe for the feature connection
>     - Rebased on main
> ---
>  lib/features.c | 22 +++++++++++++++-------
>  tests/ovn.at   | 31 +++++++++++++++++++++++++++++++
>  2 files changed, 46 insertions(+), 7 deletions(-)
>
> diff --git a/lib/features.c b/lib/features.c
> index f15ec42bb..462b99818 100644
> --- a/lib/features.c
> +++ b/lib/features.c
> @@ -26,10 +26,13 @@
>  #include "openvswitch/rconn.h"
>  #include "openvswitch/ofp-msgs.h"
>  #include "openvswitch/ofp-meter.h"
> +#include "openvswitch/ofp-util.h"
>  #include "ovn/features.h"
>
>  VLOG_DEFINE_THIS_MODULE(features);
>
> +#define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
> +
>  struct ovs_feature {
>      enum ovs_feature_value value;
>      const char *name;
> @@ -74,7 +77,8 @@ static void
>  ovs_feature_rconn_setup(const char *br_name)
>  {
>      if (!swconn) {
> -        swconn = rconn_create(5, 0, DSCP_DEFAULT, 1 << OFP15_VERSION);
> +        swconn = rconn_create(FEATURES_DEFAULT_PROBE_INTERVAL_SEC, 0,
> +                              DSCP_DEFAULT, 1 << OFP15_VERSION);
>      }
>
>      if (!rconn_is_connected(swconn)) {
> @@ -85,11 +89,14 @@ ovs_feature_rconn_setup(const char *br_name)
>          }
>          free(target);
>      }
> +    rconn_set_probe_interval(swconn, FEATURES_DEFAULT_PROBE_INTERVAL_SEC);
>  }
>
>  static bool
>  ovs_feature_get_openflow_cap(const char *br_name)
>  {
> +    struct ofpbuf *msg;
> +
>      if (!br_name) {
>          return false;
>      }
> @@ -102,15 +109,14 @@ ovs_feature_get_openflow_cap(const char *br_name)
>      }
>
>      /* send new requests just after reconnect. */
> -    if (conn_seq_no == rconn_get_connection_seqno(swconn)) {
> -        return false;
> +    if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
> +        /* dump datapath meter capabilities. */
> +        msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
> +                           rconn_get_version(swconn), 0);
> +        rconn_send(swconn, msg, NULL);
>      }
>
>      bool ret = false;
> -    /* dump datapath meter capabilities. */
> -    struct ofpbuf *msg =
> ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
> -                                      rconn_get_version(swconn), 0);
> -    rconn_send(swconn, msg, NULL);
>      for (int i = 0; i < 50; i++) {
>          msg = rconn_recv(swconn);
>          if (!msg) {
> @@ -137,6 +143,8 @@ ovs_feature_get_openflow_cap(const char *br_name)
>                  }
>              }
>              conn_seq_no = rconn_get_connection_seqno(swconn);
> +        } else if (type == OFPTYPE_ECHO_REQUEST) {
> +            rconn_send(swconn, ofputil_encode_echo_reply(oh), NULL);
>          }
>          ofpbuf_delete(msg);
>      }
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 9d52b1677..0ef536509 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -33450,3 +33450,34 @@ check_drops
>  OVN_CLEANUP([hv1],[hv2])
>  AT_CLEANUP
>  ])
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([feature inactivity probe])
> +ovn_start
> +net_add n1
> +
> +sim_add hv1
> +as hv1
> +check ovs-vsctl add-br br-phys
> +ovn_attach n1 br-phys 192.168.0.1
> +
> +dnl Ensure that there are at least 3 openflow connections.
> +OVS_WAIT_UNTIL([test "$(grep -c 'negotiated OpenFlow version'
> hv1/ovs-vswitchd.log)" -eq "3"])
> +
> +dnl "Wait" 3 times 60 seconds and ensure ovn-controller writes to the
> +dnl openflow connections in the meantime.  This should allow ovs-vswitchd
> +dnl to probe the openflow connections at least twice.
> +
> +as hv1 ovs-appctl time/warp 60000
> +check ovn-nbctl --wait=hv sync
> +
> +as hv1 ovs-appctl time/warp 60000
> +check ovn-nbctl --wait=hv sync
> +
> +as hv1 ovs-appctl time/warp 60000
> +check ovn-nbctl --wait=hv sync
> +
> +AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
> +OVN_CLEANUP([hv1])
> +AT_CLEANUP
> +])
> --
> 2.31.1
>
>
Looks good to me, thanks.

Acked-by: Ales Musil <amusil@redhat.com>
Dumitru Ceara Nov. 24, 2022, 4:25 p.m. UTC | #2
On 11/24/22 15:24, Ales Musil wrote:
> On Thu, Nov 24, 2022 at 3:03 PM Xavier Simonart <xsimonar@redhat.com> wrote:
> 
>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2144084
>>
>> Signed-off-by: Xavier Simonart <xsimonar@redhat.com>
>>
>> ---
>> v2: - Based on Dumitru's feedback:
>>       - Reduce test case length by removing uggly sleep
>>       - Add an explicit probe for the feature connection
>>     - Rebased on main
>> ---
>>  lib/features.c | 22 +++++++++++++++-------
>>  tests/ovn.at   | 31 +++++++++++++++++++++++++++++++
>>  2 files changed, 46 insertions(+), 7 deletions(-)
>>
>> diff --git a/lib/features.c b/lib/features.c
>> index f15ec42bb..462b99818 100644
>> --- a/lib/features.c
>> +++ b/lib/features.c
>> @@ -26,10 +26,13 @@
>>  #include "openvswitch/rconn.h"
>>  #include "openvswitch/ofp-msgs.h"
>>  #include "openvswitch/ofp-meter.h"
>> +#include "openvswitch/ofp-util.h"
>>  #include "ovn/features.h"
>>
>>  VLOG_DEFINE_THIS_MODULE(features);
>>
>> +#define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>> +
>>  struct ovs_feature {
>>      enum ovs_feature_value value;
>>      const char *name;
>> @@ -74,7 +77,8 @@ static void
>>  ovs_feature_rconn_setup(const char *br_name)
>>  {
>>      if (!swconn) {
>> -        swconn = rconn_create(5, 0, DSCP_DEFAULT, 1 << OFP15_VERSION);
>> +        swconn = rconn_create(FEATURES_DEFAULT_PROBE_INTERVAL_SEC, 0,
>> +                              DSCP_DEFAULT, 1 << OFP15_VERSION);
>>      }
>>
>>      if (!rconn_is_connected(swconn)) {
>> @@ -85,11 +89,14 @@ ovs_feature_rconn_setup(const char *br_name)
>>          }
>>          free(target);
>>      }
>> +    rconn_set_probe_interval(swconn, FEATURES_DEFAULT_PROBE_INTERVAL_SEC);
>>  }
>>
>>  static bool
>>  ovs_feature_get_openflow_cap(const char *br_name)
>>  {
>> +    struct ofpbuf *msg;
>> +
>>      if (!br_name) {
>>          return false;
>>      }
>> @@ -102,15 +109,14 @@ ovs_feature_get_openflow_cap(const char *br_name)
>>      }
>>
>>      /* send new requests just after reconnect. */
>> -    if (conn_seq_no == rconn_get_connection_seqno(swconn)) {
>> -        return false;
>> +    if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
>> +        /* dump datapath meter capabilities. */
>> +        msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
>> +                           rconn_get_version(swconn), 0);
>> +        rconn_send(swconn, msg, NULL);
>>      }
>>
>>      bool ret = false;
>> -    /* dump datapath meter capabilities. */
>> -    struct ofpbuf *msg =
>> ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
>> -                                      rconn_get_version(swconn), 0);
>> -    rconn_send(swconn, msg, NULL);
>>      for (int i = 0; i < 50; i++) {
>>          msg = rconn_recv(swconn);
>>          if (!msg) {
>> @@ -137,6 +143,8 @@ ovs_feature_get_openflow_cap(const char *br_name)
>>                  }
>>              }
>>              conn_seq_no = rconn_get_connection_seqno(swconn);
>> +        } else if (type == OFPTYPE_ECHO_REQUEST) {
>> +            rconn_send(swconn, ofputil_encode_echo_reply(oh), NULL);
>>          }
>>          ofpbuf_delete(msg);
>>      }
>> diff --git a/tests/ovn.at b/tests/ovn.at
>> index 9d52b1677..0ef536509 100644
>> --- a/tests/ovn.at
>> +++ b/tests/ovn.at
>> @@ -33450,3 +33450,34 @@ check_drops
>>  OVN_CLEANUP([hv1],[hv2])
>>  AT_CLEANUP
>>  ])
>> +
>> +OVN_FOR_EACH_NORTHD([
>> +AT_SETUP([feature inactivity probe])
>> +ovn_start
>> +net_add n1
>> +
>> +sim_add hv1
>> +as hv1
>> +check ovs-vsctl add-br br-phys
>> +ovn_attach n1 br-phys 192.168.0.1
>> +
>> +dnl Ensure that there are at least 3 openflow connections.
>> +OVS_WAIT_UNTIL([test "$(grep -c 'negotiated OpenFlow version'
>> hv1/ovs-vswitchd.log)" -eq "3"])
>> +
>> +dnl "Wait" 3 times 60 seconds and ensure ovn-controller writes to the
>> +dnl openflow connections in the meantime.  This should allow ovs-vswitchd
>> +dnl to probe the openflow connections at least twice.
>> +
>> +as hv1 ovs-appctl time/warp 60000
>> +check ovn-nbctl --wait=hv sync
>> +
>> +as hv1 ovs-appctl time/warp 60000
>> +check ovn-nbctl --wait=hv sync
>> +
>> +as hv1 ovs-appctl time/warp 60000
>> +check ovn-nbctl --wait=hv sync
>> +
>> +AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
>> +OVN_CLEANUP([hv1])
>> +AT_CLEANUP
>> +])
>> --
>> 2.31.1
>>
>>
> Looks good to me, thanks.
> 
> Acked-by: Ales Musil <amusil@redhat.com>
> 

Just so we don't forget to backport this:

Fixes: 1b587c4fad7b ("controller: add datapath meter capability check")

Thanks,
Dumitru
Dumitru Ceara Nov. 25, 2022, 1:12 p.m. UTC | #3
On 11/24/22 15:24, Ales Musil wrote:
> On Thu, Nov 24, 2022 at 3:03 PM Xavier Simonart <xsimonar@redhat.com> wrote:
> 
>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2144084
>>
>> Signed-off-by: Xavier Simonart <xsimonar@redhat.com>
>>
>> ---
>> v2: - Based on Dumitru's feedback:
>>       - Reduce test case length by removing uggly sleep
>>       - Add an explicit probe for the feature connection
>>     - Rebased on main
>> ---
>>  lib/features.c | 22 +++++++++++++++-------
>>  tests/ovn.at   | 31 +++++++++++++++++++++++++++++++
>>  2 files changed, 46 insertions(+), 7 deletions(-)
>>
>> diff --git a/lib/features.c b/lib/features.c
>> index f15ec42bb..462b99818 100644
>> --- a/lib/features.c
>> +++ b/lib/features.c
>> @@ -26,10 +26,13 @@
>>  #include "openvswitch/rconn.h"
>>  #include "openvswitch/ofp-msgs.h"
>>  #include "openvswitch/ofp-meter.h"
>> +#include "openvswitch/ofp-util.h"
>>  #include "ovn/features.h"
>>
>>  VLOG_DEFINE_THIS_MODULE(features);
>>
>> +#define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
>> +
>>  struct ovs_feature {
>>      enum ovs_feature_value value;
>>      const char *name;
>> @@ -74,7 +77,8 @@ static void
>>  ovs_feature_rconn_setup(const char *br_name)
>>  {
>>      if (!swconn) {
>> -        swconn = rconn_create(5, 0, DSCP_DEFAULT, 1 << OFP15_VERSION);
>> +        swconn = rconn_create(FEATURES_DEFAULT_PROBE_INTERVAL_SEC, 0,
>> +                              DSCP_DEFAULT, 1 << OFP15_VERSION);
>>      }
>>
>>      if (!rconn_is_connected(swconn)) {
>> @@ -85,11 +89,14 @@ ovs_feature_rconn_setup(const char *br_name)
>>          }
>>          free(target);
>>      }
>> +    rconn_set_probe_interval(swconn, FEATURES_DEFAULT_PROBE_INTERVAL_SEC);
>>  }
>>
>>  static bool
>>  ovs_feature_get_openflow_cap(const char *br_name)
>>  {
>> +    struct ofpbuf *msg;
>> +
>>      if (!br_name) {
>>          return false;
>>      }
>> @@ -102,15 +109,14 @@ ovs_feature_get_openflow_cap(const char *br_name)
>>      }
>>
>>      /* send new requests just after reconnect. */
>> -    if (conn_seq_no == rconn_get_connection_seqno(swconn)) {
>> -        return false;
>> +    if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
>> +        /* dump datapath meter capabilities. */
>> +        msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
>> +                           rconn_get_version(swconn), 0);
>> +        rconn_send(swconn, msg, NULL);
>>      }
>>
>>      bool ret = false;
>> -    /* dump datapath meter capabilities. */
>> -    struct ofpbuf *msg =
>> ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
>> -                                      rconn_get_version(swconn), 0);
>> -    rconn_send(swconn, msg, NULL);
>>      for (int i = 0; i < 50; i++) {
>>          msg = rconn_recv(swconn);
>>          if (!msg) {
>> @@ -137,6 +143,8 @@ ovs_feature_get_openflow_cap(const char *br_name)
>>                  }
>>              }
>>              conn_seq_no = rconn_get_connection_seqno(swconn);
>> +        } else if (type == OFPTYPE_ECHO_REQUEST) {
>> +            rconn_send(swconn, ofputil_encode_echo_reply(oh), NULL);
>>          }
>>          ofpbuf_delete(msg);
>>      }
>> diff --git a/tests/ovn.at b/tests/ovn.at
>> index 9d52b1677..0ef536509 100644
>> --- a/tests/ovn.at
>> +++ b/tests/ovn.at
>> @@ -33450,3 +33450,34 @@ check_drops
>>  OVN_CLEANUP([hv1],[hv2])
>>  AT_CLEANUP
>>  ])
>> +
>> +OVN_FOR_EACH_NORTHD([
>> +AT_SETUP([feature inactivity probe])
>> +ovn_start
>> +net_add n1
>> +
>> +sim_add hv1
>> +as hv1
>> +check ovs-vsctl add-br br-phys
>> +ovn_attach n1 br-phys 192.168.0.1
>> +
>> +dnl Ensure that there are at least 3 openflow connections.
>> +OVS_WAIT_UNTIL([test "$(grep -c 'negotiated OpenFlow version'
>> hv1/ovs-vswitchd.log)" -eq "3"])
>> +
>> +dnl "Wait" 3 times 60 seconds and ensure ovn-controller writes to the
>> +dnl openflow connections in the meantime.  This should allow ovs-vswitchd
>> +dnl to probe the openflow connections at least twice.
>> +
>> +as hv1 ovs-appctl time/warp 60000
>> +check ovn-nbctl --wait=hv sync
>> +
>> +as hv1 ovs-appctl time/warp 60000
>> +check ovn-nbctl --wait=hv sync
>> +
>> +as hv1 ovs-appctl time/warp 60000
>> +check ovn-nbctl --wait=hv sync
>> +
>> +AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
>> +OVN_CLEANUP([hv1])
>> +AT_CLEANUP
>> +])
>> --
>> 2.31.1
>>
>>
> Looks good to me, thanks.
> 
> Acked-by: Ales Musil <amusil@redhat.com>
> 

Thanks Ales and Xavier!

Applied and backported down to 22.03.

Thanks,
Dumitru
diff mbox series

Patch

diff --git a/lib/features.c b/lib/features.c
index f15ec42bb..462b99818 100644
--- a/lib/features.c
+++ b/lib/features.c
@@ -26,10 +26,13 @@ 
 #include "openvswitch/rconn.h"
 #include "openvswitch/ofp-msgs.h"
 #include "openvswitch/ofp-meter.h"
+#include "openvswitch/ofp-util.h"
 #include "ovn/features.h"
 
 VLOG_DEFINE_THIS_MODULE(features);
 
+#define FEATURES_DEFAULT_PROBE_INTERVAL_SEC 5
+
 struct ovs_feature {
     enum ovs_feature_value value;
     const char *name;
@@ -74,7 +77,8 @@  static void
 ovs_feature_rconn_setup(const char *br_name)
 {
     if (!swconn) {
-        swconn = rconn_create(5, 0, DSCP_DEFAULT, 1 << OFP15_VERSION);
+        swconn = rconn_create(FEATURES_DEFAULT_PROBE_INTERVAL_SEC, 0,
+                              DSCP_DEFAULT, 1 << OFP15_VERSION);
     }
 
     if (!rconn_is_connected(swconn)) {
@@ -85,11 +89,14 @@  ovs_feature_rconn_setup(const char *br_name)
         }
         free(target);
     }
+    rconn_set_probe_interval(swconn, FEATURES_DEFAULT_PROBE_INTERVAL_SEC);
 }
 
 static bool
 ovs_feature_get_openflow_cap(const char *br_name)
 {
+    struct ofpbuf *msg;
+
     if (!br_name) {
         return false;
     }
@@ -102,15 +109,14 @@  ovs_feature_get_openflow_cap(const char *br_name)
     }
 
     /* send new requests just after reconnect. */
-    if (conn_seq_no == rconn_get_connection_seqno(swconn)) {
-        return false;
+    if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
+        /* dump datapath meter capabilities. */
+        msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
+                           rconn_get_version(swconn), 0);
+        rconn_send(swconn, msg, NULL);
     }
 
     bool ret = false;
-    /* dump datapath meter capabilities. */
-    struct ofpbuf *msg = ofpraw_alloc(OFPRAW_OFPST13_METER_FEATURES_REQUEST,
-                                      rconn_get_version(swconn), 0);
-    rconn_send(swconn, msg, NULL);
     for (int i = 0; i < 50; i++) {
         msg = rconn_recv(swconn);
         if (!msg) {
@@ -137,6 +143,8 @@  ovs_feature_get_openflow_cap(const char *br_name)
                 }
             }
             conn_seq_no = rconn_get_connection_seqno(swconn);
+        } else if (type == OFPTYPE_ECHO_REQUEST) {
+            rconn_send(swconn, ofputil_encode_echo_reply(oh), NULL);
         }
         ofpbuf_delete(msg);
     }
diff --git a/tests/ovn.at b/tests/ovn.at
index 9d52b1677..0ef536509 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -33450,3 +33450,34 @@  check_drops
 OVN_CLEANUP([hv1],[hv2])
 AT_CLEANUP
 ])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([feature inactivity probe])
+ovn_start
+net_add n1
+
+sim_add hv1
+as hv1
+check ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+
+dnl Ensure that there are at least 3 openflow connections.
+OVS_WAIT_UNTIL([test "$(grep -c 'negotiated OpenFlow version' hv1/ovs-vswitchd.log)" -eq "3"])
+
+dnl "Wait" 3 times 60 seconds and ensure ovn-controller writes to the
+dnl openflow connections in the meantime.  This should allow ovs-vswitchd
+dnl to probe the openflow connections at least twice.
+
+as hv1 ovs-appctl time/warp 60000
+check ovn-nbctl --wait=hv sync
+
+as hv1 ovs-appctl time/warp 60000
+check ovn-nbctl --wait=hv sync
+
+as hv1 ovs-appctl time/warp 60000
+check ovn-nbctl --wait=hv sync
+
+AT_CHECK([test -z "`grep disconnecting hv1/ovs-vswitchd.log`"])
+OVN_CLEANUP([hv1])
+AT_CLEANUP
+])