[ovs-dev] ovn-controller: Fix crash when sending GARP when openflow disconnection.

Message ID 20180212081747.1404-1-ligs@dtdream.com
State Changes Requested
Headers show
Series
  • [ovs-dev] ovn-controller: Fix crash when sending GARP when openflow disconnection.
Related show

Commit Message

Guoshuai Li Feb. 12, 2018, 8:17 a.m.
This is call stack:
Program received signal SIGABRT, Aborted.
1  0x00007ffff6a4f8e8 in __GI_abort () at abort.c:90
2  0x00000000004765d6 in ofputil_protocol_to_ofp_version (protocol=<optimized out>) at lib/ofp-util.c:769
3  0x000000000047c19e in ofputil_encode_packet_out (po=po@entry=0x7fffffffa0e0, protocol=<optimized out>) at lib/ofp-util.c:7060
4  0x0000000000410870 in send_garp (garp=0x83cfe0, current_time=current_time@entry=1200375400) at ovn/controller/pinctrl.c:1738
5  0x000000000041430f in send_garp_run (active_tunnels=<optimized out>, local_datapaths=0x7fffffffc0a0, chassis_index=<optimized out>, chassis=0x8194d0, br_int=<optimized out>, ctx=0x7fffffffc080) at ovn/controller/pinctrl.c:2069

Signed-off-by: Guoshuai Li <ligs@dtdream.com>
---
 ovn/controller/pinctrl.c | 40 ++++++++++++++++++++++------------------
 tests/ovn.at             | 14 ++++++++++++++
 2 files changed, 36 insertions(+), 18 deletions(-)

Comments

Mark Michelson Feb. 12, 2018, 3:18 p.m. | #1
I'm a little hesitant about the `sleep 3` in the test. However, I think 
it should be fine given the nature of the test and the hard-coded garp 
backoff timer.

Acked-by: Mark Michelson <mmichels@redhat.com>

On 02/12/2018 02:17 AM, Guoshuai Li wrote:
> This is call stack:
> Program received signal SIGABRT, Aborted.
> 1  0x00007ffff6a4f8e8 in __GI_abort () at abort.c:90
> 2  0x00000000004765d6 in ofputil_protocol_to_ofp_version (protocol=<optimized out>) at lib/ofp-util.c:769
> 3  0x000000000047c19e in ofputil_encode_packet_out (po=po@entry=0x7fffffffa0e0, protocol=<optimized out>) at lib/ofp-util.c:7060
> 4  0x0000000000410870 in send_garp (garp=0x83cfe0, current_time=current_time@entry=1200375400) at ovn/controller/pinctrl.c:1738
> 5  0x000000000041430f in send_garp_run (active_tunnels=<optimized out>, local_datapaths=0x7fffffffc0a0, chassis_index=<optimized out>, chassis=0x8194d0, br_int=<optimized out>, ctx=0x7fffffffc080) at ovn/controller/pinctrl.c:2069
> 
> Signed-off-by: Guoshuai Li <ligs@dtdream.com>
> ---
>   ovn/controller/pinctrl.c | 40 ++++++++++++++++++++++------------------
>   tests/ovn.at             | 14 ++++++++++++++
>   2 files changed, 36 insertions(+), 18 deletions(-)
> 
> diff --git a/ovn/controller/pinctrl.c b/ovn/controller/pinctrl.c
> index 14c95ff54..9537735cf 100644
> --- a/ovn/controller/pinctrl.c
> +++ b/ovn/controller/pinctrl.c
> @@ -1071,27 +1071,31 @@ pinctrl_run(struct controller_ctx *ctx,
>   
>       rconn_run(swconn);
>   
> -    if (rconn_is_connected(swconn)) {
> -        if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
> -            pinctrl_setup(swconn);
> -            conn_seq_no = rconn_get_connection_seqno(swconn);
> -            flush_put_mac_bindings();
> -        }
> -
> -        /* Process a limited number of messages per call. */
> -        for (int i = 0; i < 50; i++) {
> -            struct ofpbuf *msg = rconn_recv(swconn);
> -            if (!msg) {
> -                break;
> -            }
> +    if (!rconn_is_connected(swconn)) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_WARN_RL(&rl, "%s is not connected.", rconn_get_target(swconn));
> +        return;
> +    }
>   
> -            const struct ofp_header *oh = msg->data;
> -            enum ofptype type;
> +    if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
> +        pinctrl_setup(swconn);
> +        conn_seq_no = rconn_get_connection_seqno(swconn);
> +        flush_put_mac_bindings();
> +    }
>   
> -            ofptype_decode(&type, oh);
> -            pinctrl_recv(oh, type, ctx);
> -            ofpbuf_delete(msg);
> +    /* Process a limited number of messages per call. */
> +    for (int i = 0; i < 50; i++) {
> +        struct ofpbuf *msg = rconn_recv(swconn);
> +        if (!msg) {
> +            break;
>           }
> +
> +        const struct ofp_header *oh = msg->data;
> +        enum ofptype type;
> +
> +        ofptype_decode(&type, oh);
> +        pinctrl_recv(oh, type, ctx);
> +        ofpbuf_delete(msg);
>       }
>   
>       run_put_mac_bindings(ctx);
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 00d26e757..67df0a8df 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -3566,6 +3566,20 @@ AT_CHECK([ovs-vsctl add-port br-int localvif1 -- set Interface localvif1 externa
>   echo "fffffffffffff0000000000108060001080006040001f00000000001c0a80102000000000000c0a80102" > expected
>   OVN_CHECK_PACKETS([hv/snoopvif-tx.pcap], [expected])
>   
> +# delete openflow connection.
> +as hv
> +OVS_APP_EXIT_AND_WAIT([ovs-vswitchd])
> +
> +# Wait for send Garp.
> +sleep 3
> +
> +# check ovn-controller status.
> +as hv
> +AT_CHECK([ovs-appctl -t ovn-controller version | wc -l], [0], [1
> +])
> +
> +start_daemon ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -vunixctl
> +
>   # Delete the localnet ports.
>   AT_CHECK([ovs-vsctl del-port localvif1])
>   AT_CHECK([ovn-nbctl lsp-del ln_port])
>
Ben Pfaff Feb. 15, 2018, 12:41 a.m. | #2
On Mon, Feb 12, 2018 at 04:17:47PM +0800, Guoshuai Li wrote:
> This is call stack:
> Program received signal SIGABRT, Aborted.
> 1  0x00007ffff6a4f8e8 in __GI_abort () at abort.c:90
> 2  0x00000000004765d6 in ofputil_protocol_to_ofp_version (protocol=<optimized out>) at lib/ofp-util.c:769
> 3  0x000000000047c19e in ofputil_encode_packet_out (po=po@entry=0x7fffffffa0e0, protocol=<optimized out>) at lib/ofp-util.c:7060
> 4  0x0000000000410870 in send_garp (garp=0x83cfe0, current_time=current_time@entry=1200375400) at ovn/controller/pinctrl.c:1738
> 5  0x000000000041430f in send_garp_run (active_tunnels=<optimized out>, local_datapaths=0x7fffffffc0a0, chassis_index=<optimized out>, chassis=0x8194d0, br_int=<optimized out>, ctx=0x7fffffffc080) at ovn/controller/pinctrl.c:2069
> 
> Signed-off-by: Guoshuai Li <ligs@dtdream.com>

Thanks for fixing the bug!

I think we should not log a warning about this.  It is pretty much
unavoidable that sometimes the controller will be disconnected from the
switch, especially at controller startup.  WARN level log messages
should be reserved for situations that are likely to indicate a problem.
It might make sense to log something at INFO level, but the code for
maintaining OpenFlow connections already logs plenty of INFO messages
about connecting and disconnecting, so I don't think we really need
another one here.

Like Mark, I am not sure that the test is necessary.

Will you submit a v2?

Thanks,

Ben.
Guoshuai Li Feb. 15, 2018, 11:08 a.m. | #3
> Thanks for fixing the bug!
>
> I think we should not log a warning about this.  It is pretty much
> unavoidable that sometimes the controller will be disconnected from the
> switch, especially at controller startup.  WARN level log messages
> should be reserved for situations that are likely to indicate a problem.
> It might make sense to log something at INFO level, but the code for
> maintaining OpenFlow connections already logs plenty of INFO messages
> about connecting and disconnecting, so I don't think we really need
> another one here.
I remove WARN.
> Like Mark, I am not sure that the test is necessary.
I optimized the test case.
> Will you submit a v2?
https://mail.openvswitch.org/pipermail/ovs-dev/2018-February/344491.html
>
> Thanks,
>
> Ben.

Patch

diff --git a/ovn/controller/pinctrl.c b/ovn/controller/pinctrl.c
index 14c95ff54..9537735cf 100644
--- a/ovn/controller/pinctrl.c
+++ b/ovn/controller/pinctrl.c
@@ -1071,27 +1071,31 @@  pinctrl_run(struct controller_ctx *ctx,
 
     rconn_run(swconn);
 
-    if (rconn_is_connected(swconn)) {
-        if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
-            pinctrl_setup(swconn);
-            conn_seq_no = rconn_get_connection_seqno(swconn);
-            flush_put_mac_bindings();
-        }
-
-        /* Process a limited number of messages per call. */
-        for (int i = 0; i < 50; i++) {
-            struct ofpbuf *msg = rconn_recv(swconn);
-            if (!msg) {
-                break;
-            }
+    if (!rconn_is_connected(swconn)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+        VLOG_WARN_RL(&rl, "%s is not connected.", rconn_get_target(swconn));
+        return;
+    }
 
-            const struct ofp_header *oh = msg->data;
-            enum ofptype type;
+    if (conn_seq_no != rconn_get_connection_seqno(swconn)) {
+        pinctrl_setup(swconn);
+        conn_seq_no = rconn_get_connection_seqno(swconn);
+        flush_put_mac_bindings();
+    }
 
-            ofptype_decode(&type, oh);
-            pinctrl_recv(oh, type, ctx);
-            ofpbuf_delete(msg);
+    /* Process a limited number of messages per call. */
+    for (int i = 0; i < 50; i++) {
+        struct ofpbuf *msg = rconn_recv(swconn);
+        if (!msg) {
+            break;
         }
+
+        const struct ofp_header *oh = msg->data;
+        enum ofptype type;
+
+        ofptype_decode(&type, oh);
+        pinctrl_recv(oh, type, ctx);
+        ofpbuf_delete(msg);
     }
 
     run_put_mac_bindings(ctx);
diff --git a/tests/ovn.at b/tests/ovn.at
index 00d26e757..67df0a8df 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -3566,6 +3566,20 @@  AT_CHECK([ovs-vsctl add-port br-int localvif1 -- set Interface localvif1 externa
 echo "fffffffffffff0000000000108060001080006040001f00000000001c0a80102000000000000c0a80102" > expected
 OVN_CHECK_PACKETS([hv/snoopvif-tx.pcap], [expected])
 
+# delete openflow connection.
+as hv
+OVS_APP_EXIT_AND_WAIT([ovs-vswitchd])
+
+# Wait for send Garp.
+sleep 3
+
+# check ovn-controller status.
+as hv
+AT_CHECK([ovs-appctl -t ovn-controller version | wc -l], [0], [1
+])
+
+start_daemon ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -vunixctl
+
 # Delete the localnet ports.
 AT_CHECK([ovs-vsctl del-port localvif1])
 AT_CHECK([ovn-nbctl lsp-del ln_port])