
[ovs-dev,v2] northd: Fix multicast table full

Message ID 20220825101632.773113-1-xsimonar@redhat.com
State Superseded
Series [ovs-dev,v2] northd: Fix multicast table full

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_ovn-kubernetes success github build: passed
ovsrobot/github-robot-_Build_and_Test fail github build: failed

Commit Message

Xavier Simonart Aug. 25, 2022, 10:16 a.m. UTC
active_v4_flows count was initialized when the northd node was computed.
However, neither sb_multicast_group nor en_sb_igmp_group causes northd
updates. Hence this count could keep increasing while processing igmp groups.

This issue was sometimes 'hidden' by northd recomputes triggered when lflow changes
could not be incrementally processed (SB busy).

active_v4_flows is now reinitialized right before building flows
(i.e. as part of the lflow node, which is computed on igmp group changes).

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2094710

Signed-off-by: Xavier Simonart <xsimonar@redhat.com>

 ---
 v2:  - rebased on main
      - use platform independent print format
---
 northd/northd.c     |  18 +++++--
 tests/system-ovn.at | 125 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 3 deletions(-)

Comments

Dumitru Ceara Aug. 26, 2022, 1:07 p.m. UTC | #1
On 8/25/22 12:16, Xavier Simonart wrote:
> active_v4_flows count was initialized when the northd node was computed.
> However, neither sb_multicast_group nor en_sb_igmp_group causes northd
> updates. Hence this count could keep increasing while processing igmp groups.
> 
> This issue was sometimes 'hidden' by northd recomputes triggered when lflow changes
> could not be incrementally processed (SB busy).
> 
> active_v4_flows is now reinitialized right before building flows
> (i.e. as part of the lflow node, which is computed on igmp group changes).
> 
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2094710
> 
> Signed-off-by: Xavier Simonart <xsimonar@redhat.com>
> 

Hi Xavier,

Thanks for fixing this!

>  ---
>  v2:  - rebased on main
>       - use platform independent print format
> ---
>  northd/northd.c     |  18 +++++--
>  tests/system-ovn.at | 125 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 140 insertions(+), 3 deletions(-)
> 
> diff --git a/northd/northd.c b/northd/northd.c
> index 7e2681865..d8a9ae769 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -1051,7 +1051,12 @@ init_mcast_info_for_switch_datapath(struct ovn_datapath *od)
>      mcast_sw_info->query_max_response =
>          smap_get_ullong(&od->nbs->other_config, "mcast_query_max_response",
>                          OVN_MCAST_DEFAULT_QUERY_MAX_RESPONSE_S);
> +}
>  
> +static void
> +init_mcast_flow_count(struct ovn_datapath *od)
> +{

Calling this for an 'od' that corresponds to a logical router is
actually undefined behavior, I think.

What do you think about adding a check here, something like below?

if (od->nbr) {
    return;
}
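
I.e. (just a sketch, the rest of the function staying as in your patch)
init_mcast_flow_count() would then look like:

static void
init_mcast_flow_count(struct ovn_datapath *od)
{
    if (od->nbr) {
        /* Multicast flow limits only apply to logical switch datapaths. */
        return;
    }

    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
    mcast_sw_info->active_v4_flows = ATOMIC_VAR_INIT(0);
    mcast_sw_info->active_v6_flows = ATOMIC_VAR_INIT(0);
}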

> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>      mcast_sw_info->active_v4_flows = ATOMIC_VAR_INIT(0);
>      mcast_sw_info->active_v6_flows = ATOMIC_VAR_INIT(0);
>  }
> @@ -8368,6 +8373,10 @@ build_lswitch_ip_mcast_igmp_mld(struct ovn_igmp_group *igmp_group,
>              if (atomic_compare_exchange_strong(
>                          &mcast_sw_info->active_v4_flows, &table_size,
>                          mcast_sw_info->table_size)) {
> +                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
> +
> +                VLOG_INFO_RL(&rl, "Too many active mcast flows: %"PRIu64,
> +                             mcast_sw_info->active_v4_flows);
>                  return;
>              }
>              atomic_add(&mcast_sw_info->active_v4_flows, 1, &dummy);
> @@ -15069,6 +15078,11 @@ build_mcast_groups(struct lflow_input *input_data,
>  
>      hmap_init(mcast_groups);
>      hmap_init(igmp_groups);
> +    struct ovn_datapath *od;
> +
> +    HMAP_FOR_EACH (od, key_node, datapaths) {
> +        init_mcast_flow_count(od);
> +    }
>  
>      HMAP_FOR_EACH (op, key_node, ports) {
>          if (op->nbrp && lrport_is_enabled(op->nbrp)) {
> @@ -15126,8 +15140,7 @@ build_mcast_groups(struct lflow_input *input_data,
>          }
>  
>          /* If the datapath value is stale, purge the group. */
> -        struct ovn_datapath *od =
> -            ovn_datapath_from_sbrec(datapaths, sb_igmp->datapath);
> +        od = ovn_datapath_from_sbrec(datapaths, sb_igmp->datapath);
>  
>          if (!od || ovn_datapath_is_stale(od)) {
>              sbrec_igmp_group_delete(sb_igmp);
> @@ -15172,7 +15185,6 @@ build_mcast_groups(struct lflow_input *input_data,
>       * IGMP groups are based on the groups learnt by their multicast enabled
>       * peers.
>       */
> -    struct ovn_datapath *od;
>      HMAP_FOR_EACH (od, key_node, datapaths) {
>  
>          if (ovs_list_is_empty(&od->mcast_info.groups)) {
> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> index 992813614..87093fbcc 100644
> --- a/tests/system-ovn.at
> +++ b/tests/system-ovn.at
> @@ -8266,3 +8266,128 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
>  
>  AT_CLEANUP
>  ])
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([mcast flow count])

To match the other IGMP/IP multicast system tests we should probably add:

AT_KEYWORDS([ovnigmp IP-multicast])

Other tests that use tcpdump have an additional check:

AT_SKIP_IF([test $HAVE_TCPDUMP = no])

Not a problem of this patch, but I wonder what's best: to almost silently
skip the test when tcpdump is not installed, or to fail the test for the
same reason.
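
I.e. either keep the AT_SKIP_IF() above or turn it into a hard failure,
something like:

AT_FAIL_IF([test $HAVE_TCPDUMP = no])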

> +
> +ovn_start
> +
> +OVS_TRAFFIC_VSWITCHD_START()
> +ADD_BR([br-int])
> +
> +# Set external-ids in br-int needed for ovn-controller
> +ovs-vsctl \
> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
> +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
> +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
> +
> +# Start ovn-controller
> +start_daemon ovn-controller
> +
> +check ovn-nbctl ls-add ls
> +check ovn-nbctl lsp-add ls vm1
> +check ovn-nbctl lsp-set-addresses vm1 00:00:00:00:00:01
> +check ovn-nbctl lsp-add ls vm2
> +check ovn-nbctl lsp-set-addresses vm2 00:00:00:00:00:02
> +check ovn-nbctl lsp-add ls vm3
> +check ovn-nbctl lsp-set-addresses vm3 00:00:00:00:00:03
> +
> +check ovn-nbctl set logical_switch ls other_config:mcast_querier=false other_config:mcast_snoop=true other_config:mcast_query_interval=30 other_config:mcast_eth_src=00:00:00:00:00:05 other_config:mcast_ip4_src=42.42.42.5 other_config:mcast_ip6_src=fe80::1 other_config:mcast_idle_timeout=3000
> +ovn-sbctl list ip_multicast
> +
> +wait_igmp_flows_installed()
> +{
> +    OVS_WAIT_UNTIL([ovs-ofctl dump-flows br-int table=31 | \
> +    grep 'priority=90' | grep "nw_dst=$1"])
> +}
> +
> +ADD_NAMESPACES(vm1)
> +ADD_INT([vm1], [vm1], [br-int], [42.42.42.1/24])
> +NS_CHECK_EXEC([vm1], [ip link set vm1 address 00:00:00:00:00:01], [0])
> +NS_CHECK_EXEC([vm1], [ip route add default via 42.42.42.5], [0])
> +NS_CHECK_EXEC([vm1], [ip -6 addr add 2000::1/24 dev vm1], [0])
> +NS_CHECK_EXEC([vm1], [ip -6 route add default via 2000::5], [0])
> +check ovs-vsctl set Interface vm1 external_ids:iface-id=vm1
> +
> +ADD_NAMESPACES(vm2)
> +ADD_INT([vm2], [vm2], [br-int], [42.42.42.2/24])
> +NS_CHECK_EXEC([vm2], [ip link set vm2 address 00:00:00:00:00:02], [0])
> +NS_CHECK_EXEC([vm2], [ip -6 addr add 2000::2/64 dev vm2], [0])
> +NS_CHECK_EXEC([vm2], [ip link set lo up], [0])
> +check ovs-vsctl set Interface vm2 external_ids:iface-id=vm2
> +
> +ADD_NAMESPACES(vm3)
> +NS_CHECK_EXEC([vm3], [tcpdump -n -i any -nnle > vm3.pcap 2>/dev/null &], [ignore], [ignore])

I know we used tcpdump like this before in this file but we probably
should use NETNS_DAEMONIZE() instead.
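
E.g. something along these lines (pidfile name just an example):

NETNS_DAEMONIZE([vm3], [tcpdump -n -i any -nnle > vm3.pcap 2>/dev/null], [tcpdump3.pid])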

> +
> +ADD_INT([vm3], [vm3], [br-int], [42.42.42.3/24])
> +NS_CHECK_EXEC([vm3], [ip link set vm3 address 00:00:00:00:00:03], [0])
> +NS_CHECK_EXEC([vm3], [ip -6 addr add 2000::3/64 dev vm3], [0])

There are no v6 checks later on; were you planning to add some?  If not,
this line can probably be removed.

> +NS_CHECK_EXEC([vm3], [ip link set lo up], [0])
> +NS_CHECK_EXEC([vm3], [ip route add default via 42.42.42.5], [0])
> +NS_CHECK_EXEC([vm3], [ip -6 route add default via 2000::5], [0])

Same here.

> +check ovs-vsctl set Interface vm3 external_ids:iface-id=vm3
> +
> +NS_CHECK_EXEC([vm2], [sysctl -w net.ipv4.igmp_max_memberships=100], [ignore], [ignore])
> +NS_CHECK_EXEC([vm3], [sysctl -w net.ipv4.igmp_max_memberships=100], [ignore], [ignore])
> +wait_for_ports_up
> +
> +NS_CHECK_EXEC([vm3], [ip addr add 228.0.0.1 dev vm3 autojoin], [0])
> +wait_igmp_flows_installed 228.0.0.1
> +
> +NS_CHECK_EXEC([vm1], [ping -q -c 3 -i 0.3 -w 2 228.0.0.1], [ignore], [ignore])
> +
> +OVS_WAIT_UNTIL([
> +    requests=`grep "ICMP echo request" -c vm3.pcap`
> +    test "${requests}" -ge "3"
> +])
> +kill $(pidof tcpdump)
> +
> +NS_CHECK_EXEC([vm3], [tcpdump -n -i any -nnleX > vm3.pcap 2>/dev/null &], [ignore], [ignore])
> +NS_CHECK_EXEC([vm2], [tcpdump -n -i any -nnleX > vm2.pcap 2>/dev/null &], [ignore], [ignore])
> +NS_CHECK_EXEC([vm1], [tcpdump -n -i any -nnleX > vm1.pcap 2>/dev/null &], [ignore], [ignore])

Same comment about using NETNS_DAEMONIZE() instead.

> +
> +for i in `seq 1 40`;do
> +    NS_CHECK_EXEC([vm2], [ip addr add 228.1.$i.1 dev vm2 autojoin &], [0])
> +    NS_CHECK_EXEC([vm3], [ip addr add 229.1.$i.1 dev vm3 autojoin &], [0])
> +    # Do not go too fast. If going fast, there is a higher chance of sb being busy, causing full recompute (engine has not run)
> +    # In this test, we do not want too many recomputes as they might hide I+I related errors
> +    sleep 0.2
> +done
> +
> +for i in `seq 1 40`;do
> +    wait_igmp_flows_installed 228.1.$i.1
> +    wait_igmp_flows_installed 229.1.$i.1
> +done
> +ovn-sbctl list multicast_group
> +
> +NS_CHECK_EXEC([vm1], [ping -q -c 3 -i 0.3 -w 2 228.1.1.1], [ignore], [ignore])
> +
> +OVS_WAIT_UNTIL([
> +    requests=`grep "ICMP echo request" -c vm2.pcap`
> +    test "${requests}" -ge "3"
> +])
> +kill $(pidof tcpdump)
> +
> +# The test could succeed thanks to a lucky northd recompute... after hitting too many flows.
> +# Double check that we never hit the error condition.
> +AT_CHECK([grep -qE 'Too many active mcast flows' northd/ovn-northd.log], [1])
> +
> +OVS_APP_EXIT_AND_WAIT([ovn-controller])
> +
> +as ovn-sb
> +OVS_APP_EXIT_AND_WAIT([ovsdb-server])
> +
> +as ovn-nb
> +OVS_APP_EXIT_AND_WAIT([ovsdb-server])
> +
> +as northd
> +OVS_APP_EXIT_AND_WAIT([NORTHD_TYPE])
> +
> +as
> +OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
> +/connection dropped.*/d
> +/removing policing failed: No such device/d"])
> +AT_CLEANUP
> +])

Regards,
Dumitru

Patch

diff --git a/northd/northd.c b/northd/northd.c
index 7e2681865..d8a9ae769 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -1051,7 +1051,12 @@  init_mcast_info_for_switch_datapath(struct ovn_datapath *od)
     mcast_sw_info->query_max_response =
         smap_get_ullong(&od->nbs->other_config, "mcast_query_max_response",
                         OVN_MCAST_DEFAULT_QUERY_MAX_RESPONSE_S);
+}
 
+static void
+init_mcast_flow_count(struct ovn_datapath *od)
+{
+    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
     mcast_sw_info->active_v4_flows = ATOMIC_VAR_INIT(0);
     mcast_sw_info->active_v6_flows = ATOMIC_VAR_INIT(0);
 }
@@ -8368,6 +8373,10 @@  build_lswitch_ip_mcast_igmp_mld(struct ovn_igmp_group *igmp_group,
             if (atomic_compare_exchange_strong(
                         &mcast_sw_info->active_v4_flows, &table_size,
                         mcast_sw_info->table_size)) {
+                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+
+                VLOG_INFO_RL(&rl, "Too many active mcast flows: %"PRIu64,
+                             mcast_sw_info->active_v4_flows);
                 return;
             }
             atomic_add(&mcast_sw_info->active_v4_flows, 1, &dummy);
@@ -15069,6 +15078,11 @@  build_mcast_groups(struct lflow_input *input_data,
 
     hmap_init(mcast_groups);
     hmap_init(igmp_groups);
+    struct ovn_datapath *od;
+
+    HMAP_FOR_EACH (od, key_node, datapaths) {
+        init_mcast_flow_count(od);
+    }
 
     HMAP_FOR_EACH (op, key_node, ports) {
         if (op->nbrp && lrport_is_enabled(op->nbrp)) {
@@ -15126,8 +15140,7 @@  build_mcast_groups(struct lflow_input *input_data,
         }
 
         /* If the datapath value is stale, purge the group. */
-        struct ovn_datapath *od =
-            ovn_datapath_from_sbrec(datapaths, sb_igmp->datapath);
+        od = ovn_datapath_from_sbrec(datapaths, sb_igmp->datapath);
 
         if (!od || ovn_datapath_is_stale(od)) {
             sbrec_igmp_group_delete(sb_igmp);
@@ -15172,7 +15185,6 @@  build_mcast_groups(struct lflow_input *input_data,
      * IGMP groups are based on the groups learnt by their multicast enabled
      * peers.
      */
-    struct ovn_datapath *od;
     HMAP_FOR_EACH (od, key_node, datapaths) {
 
         if (ovs_list_is_empty(&od->mcast_info.groups)) {
diff --git a/tests/system-ovn.at b/tests/system-ovn.at
index 992813614..87093fbcc 100644
--- a/tests/system-ovn.at
+++ b/tests/system-ovn.at
@@ -8266,3 +8266,128 @@  OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
 
 AT_CLEANUP
 ])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([mcast flow count])
+
+ovn_start
+
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+check ovn-nbctl ls-add ls
+check ovn-nbctl lsp-add ls vm1
+check ovn-nbctl lsp-set-addresses vm1 00:00:00:00:00:01
+check ovn-nbctl lsp-add ls vm2
+check ovn-nbctl lsp-set-addresses vm2 00:00:00:00:00:02
+check ovn-nbctl lsp-add ls vm3
+check ovn-nbctl lsp-set-addresses vm3 00:00:00:00:00:03
+
+check ovn-nbctl set logical_switch ls other_config:mcast_querier=false other_config:mcast_snoop=true other_config:mcast_query_interval=30 other_config:mcast_eth_src=00:00:00:00:00:05 other_config:mcast_ip4_src=42.42.42.5 other_config:mcast_ip6_src=fe80::1 other_config:mcast_idle_timeout=3000
+ovn-sbctl list ip_multicast
+
+wait_igmp_flows_installed()
+{
+    OVS_WAIT_UNTIL([ovs-ofctl dump-flows br-int table=31 | \
+    grep 'priority=90' | grep "nw_dst=$1"])
+}
+
+ADD_NAMESPACES(vm1)
+ADD_INT([vm1], [vm1], [br-int], [42.42.42.1/24])
+NS_CHECK_EXEC([vm1], [ip link set vm1 address 00:00:00:00:00:01], [0])
+NS_CHECK_EXEC([vm1], [ip route add default via 42.42.42.5], [0])
+NS_CHECK_EXEC([vm1], [ip -6 addr add 2000::1/24 dev vm1], [0])
+NS_CHECK_EXEC([vm1], [ip -6 route add default via 2000::5], [0])
+check ovs-vsctl set Interface vm1 external_ids:iface-id=vm1
+
+ADD_NAMESPACES(vm2)
+ADD_INT([vm2], [vm2], [br-int], [42.42.42.2/24])
+NS_CHECK_EXEC([vm2], [ip link set vm2 address 00:00:00:00:00:02], [0])
+NS_CHECK_EXEC([vm2], [ip -6 addr add 2000::2/64 dev vm2], [0])
+NS_CHECK_EXEC([vm2], [ip link set lo up], [0])
+check ovs-vsctl set Interface vm2 external_ids:iface-id=vm2
+
+ADD_NAMESPACES(vm3)
+NS_CHECK_EXEC([vm3], [tcpdump -n -i any -nnle > vm3.pcap 2>/dev/null &], [ignore], [ignore])
+
+ADD_INT([vm3], [vm3], [br-int], [42.42.42.3/24])
+NS_CHECK_EXEC([vm3], [ip link set vm3 address 00:00:00:00:00:03], [0])
+NS_CHECK_EXEC([vm3], [ip -6 addr add 2000::3/64 dev vm3], [0])
+NS_CHECK_EXEC([vm3], [ip link set lo up], [0])
+NS_CHECK_EXEC([vm3], [ip route add default via 42.42.42.5], [0])
+NS_CHECK_EXEC([vm3], [ip -6 route add default via 2000::5], [0])
+check ovs-vsctl set Interface vm3 external_ids:iface-id=vm3
+
+NS_CHECK_EXEC([vm2], [sysctl -w net.ipv4.igmp_max_memberships=100], [ignore], [ignore])
+NS_CHECK_EXEC([vm3], [sysctl -w net.ipv4.igmp_max_memberships=100], [ignore], [ignore])
+wait_for_ports_up
+
+NS_CHECK_EXEC([vm3], [ip addr add 228.0.0.1 dev vm3 autojoin], [0])
+wait_igmp_flows_installed 228.0.0.1
+
+NS_CHECK_EXEC([vm1], [ping -q -c 3 -i 0.3 -w 2 228.0.0.1], [ignore], [ignore])
+
+OVS_WAIT_UNTIL([
+    requests=`grep "ICMP echo request" -c vm3.pcap`
+    test "${requests}" -ge "3"
+])
+kill $(pidof tcpdump)
+
+NS_CHECK_EXEC([vm3], [tcpdump -n -i any -nnleX > vm3.pcap 2>/dev/null &], [ignore], [ignore])
+NS_CHECK_EXEC([vm2], [tcpdump -n -i any -nnleX > vm2.pcap 2>/dev/null &], [ignore], [ignore])
+NS_CHECK_EXEC([vm1], [tcpdump -n -i any -nnleX > vm1.pcap 2>/dev/null &], [ignore], [ignore])
+
+for i in `seq 1 40`;do
+    NS_CHECK_EXEC([vm2], [ip addr add 228.1.$i.1 dev vm2 autojoin &], [0])
+    NS_CHECK_EXEC([vm3], [ip addr add 229.1.$i.1 dev vm3 autojoin &], [0])
+    # Do not go too fast. If going fast, there is a higher chance of sb being busy, causing full recompute (engine has not run)
+    # In this test, we do not want too many recomputes as they might hide I+I related errors
+    sleep 0.2
+done
+
+for i in `seq 1 40`;do
+    wait_igmp_flows_installed 228.1.$i.1
+    wait_igmp_flows_installed 229.1.$i.1
+done
+ovn-sbctl list multicast_group
+
+NS_CHECK_EXEC([vm1], [ping -q -c 3 -i 0.3 -w 2 228.1.1.1], [ignore], [ignore])
+
+OVS_WAIT_UNTIL([
+    requests=`grep "ICMP echo request" -c vm2.pcap`
+    test "${requests}" -ge "3"
+])
+kill $(pidof tcpdump)
+
+# The test could succeed thanks to a lucky northd recompute... after hitting too many flows.
+# Double check that we never hit the error condition.
+AT_CHECK([grep -qE 'Too many active mcast flows' northd/ovn-northd.log], [1])
+
+OVS_APP_EXIT_AND_WAIT([ovn-controller])
+
+as ovn-sb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as ovn-nb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as northd
+OVS_APP_EXIT_AND_WAIT([NORTHD_TYPE])
+
+as
+OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
+/connection dropped.*/d
+/removing policing failed: No such device/d"])
+AT_CLEANUP
+])