diff mbox series

[ovs-dev,2/3] northd: Fix broadcast of all traffic within a transit spine switch.

Message ID 20250212124149.3007842-3-i.maximets@ovn.org
State Accepted
Headers show
Series Fixes for broadcast behavior in Spine-Leaf topology. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Ilya Maximets Feb. 12, 2025, 12:41 p.m. UTC
Currently, OVN can't learn FDB entries from remote ports, because
learning is happening in the ingress pipeline, while only the egress
pipeline is executed in the destination availability zone.  So, the
destination zone can't learn the source MAC address and has to
broadcast replies.  The source zone also can't learn from replies,
so has to broadcast all the subsequent packets as well.

Fix that by introducing FDB learning in the egress pipeline for
remote ports.

Note: For some reason remote ports were not tracked by ovn-controller.
This wasn't a big problem because we didn't use inport/outport matches
on the traffic from remote ports in most cases.  However, FDB lookups
need to match on the input port, and ovn-controller will skip those
flows if ports are not related, so marking them as such. (The removal
is already handled in a generic way.)

Fixes: 67100f0ca819 ("ic: Add support for spine-leaf topology for transit switches.")
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
---
 controller/binding.c    |  7 +++-
 northd/northd.c         | 17 ++++++--
 northd/northd.h         | 24 ++++++-----
 northd/ovn-northd.8.xml | 40 +++++++++++++-----
 tests/ovn-ic.at         | 93 +++++++++++++++++++++++++++++++----------
 5 files changed, 133 insertions(+), 48 deletions(-)
diff mbox series

Patch

diff --git a/controller/binding.c b/controller/binding.c
index 316b2c36b..f22959277 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -2219,7 +2219,9 @@  binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
         }
 
         case LP_REMOTE:
-            /* Nothing to be done for REMOTE type. */
+            /* Remote ports are related, since we can have rules in the
+             * egress pipeline matching on traffic coming from another AZ. */
+            update_related_lport(pb, b_ctx_out);
             break;
 
         case LP_UNKNOWN: {
@@ -3131,6 +3133,9 @@  handle_updated_port(struct binding_ctx_in *b_ctx_in,
     }
 
     case LP_REMOTE:
+        update_related_lport(pb, b_ctx_out);
+        break;
+
     case LP_UNKNOWN:
         break;
     }
diff --git a/northd/northd.c b/northd/northd.c
index cad974929..ac7d49adf 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -5763,7 +5763,9 @@  build_lswitch_learn_fdb_op(
         return;
     }
 
-    if (!strcmp(op->nbsp->type, "") || lsp_is_switch(op->nbsp)
+    bool remote = lsp_is_remote(op->nbsp);
+
+    if (remote || !strcmp(op->nbsp->type, "") || lsp_is_switch(op->nbsp)
         || (lsp_is_localnet(op->nbsp) && localnet_can_learn_mac(op->nbsp))) {
         ds_clear(match);
         ds_clear(actions);
@@ -5774,7 +5776,9 @@  build_lswitch_learn_fdb_op(
         ds_put_format(actions, REGBIT_LKUP_FDB
                       " = lookup_fdb(inport, eth.src); next;");
         ovn_lflow_add_with_lport_and_hint(lflows, op->od,
-                                          S_SWITCH_IN_LOOKUP_FDB, 100,
+                                          remote ? S_SWITCH_OUT_LOOKUP_FDB
+                                                 : S_SWITCH_IN_LOOKUP_FDB,
+                                          100,
                                           ds_cstr(match), ds_cstr(actions),
                                           op->key, &op->nbsp->header_,
                                           op->lflow_ref);
@@ -5782,7 +5786,9 @@  build_lswitch_learn_fdb_op(
         ds_put_cstr(match, " && "REGBIT_LKUP_FDB" == 0");
         ds_clear(actions);
         ds_put_cstr(actions, "put_fdb(inport, eth.src); next;");
-        ovn_lflow_add_with_lport_and_hint(lflows, op->od, S_SWITCH_IN_PUT_FDB,
+        ovn_lflow_add_with_lport_and_hint(lflows, op->od,
+                                          remote ? S_SWITCH_OUT_PUT_FDB
+                                                 : S_SWITCH_IN_PUT_FDB,
                                           100, ds_cstr(match),
                                           ds_cstr(actions), op->key,
                                           &op->nbsp->header_,
@@ -5802,6 +5808,11 @@  build_lswitch_learn_fdb_od(
                   lflow_ref);
     ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 0, "1",
                   "outport = get_fdb(eth.dst); next;", lflow_ref);
+
+    ovn_lflow_add(lflows, od, S_SWITCH_OUT_LOOKUP_FDB, 0, "1", "next;",
+                  lflow_ref);
+    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PUT_FDB, 0, "1", "next;",
+                  lflow_ref);
 }
 
 /* Egress tables 8: Egress port security - IP (priority 0)
diff --git a/northd/northd.h b/northd/northd.h
index 1f29645c7..e730beb3a 100644
--- a/northd/northd.h
+++ b/northd/northd.h
@@ -498,17 +498,19 @@  enum ovn_stage {
     PIPELINE_STAGE(SWITCH, IN,  L2_UNKNOWN,    29, "ls_in_l2_unknown")    \
                                                                           \
     /* Logical switch egress stages. */                                   \
-    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      0, "ls_out_pre_acl")        \
-    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       1, "ls_out_pre_lb")         \
-    PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful")   \
-    PIPELINE_STAGE(SWITCH, OUT, ACL_HINT,     3, "ls_out_acl_hint")       \
-    PIPELINE_STAGE(SWITCH, OUT, ACL_EVAL,     4, "ls_out_acl_eval")       \
-    PIPELINE_STAGE(SWITCH, OUT, ACL_SAMPLE,   5, "ls_out_acl_sample")     \
-    PIPELINE_STAGE(SWITCH, OUT, ACL_ACTION,   6, "ls_out_acl_action")     \
-    PIPELINE_STAGE(SWITCH, OUT, QOS,          7, "ls_out_qos")       \
-    PIPELINE_STAGE(SWITCH, OUT, STATEFUL,     8, "ls_out_stateful")       \
-    PIPELINE_STAGE(SWITCH, OUT, CHECK_PORT_SEC,  9, "ls_out_check_port_sec") \
-    PIPELINE_STAGE(SWITCH, OUT, APPLY_PORT_SEC, 10, "ls_out_apply_port_sec") \
+    PIPELINE_STAGE(SWITCH, OUT, LOOKUP_FDB,      0, "ls_out_lookup_fdb")     \
+    PIPELINE_STAGE(SWITCH, OUT, PUT_FDB,         1, "ls_out_put_fdb")        \
+    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,         2, "ls_out_pre_acl")        \
+    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,          3, "ls_out_pre_lb")         \
+    PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL,    4, "ls_out_pre_stateful")   \
+    PIPELINE_STAGE(SWITCH, OUT, ACL_HINT,        5, "ls_out_acl_hint")       \
+    PIPELINE_STAGE(SWITCH, OUT, ACL_EVAL,        6, "ls_out_acl_eval")       \
+    PIPELINE_STAGE(SWITCH, OUT, ACL_SAMPLE,      7, "ls_out_acl_sample")     \
+    PIPELINE_STAGE(SWITCH, OUT, ACL_ACTION,      8, "ls_out_acl_action")     \
+    PIPELINE_STAGE(SWITCH, OUT, QOS,             9, "ls_out_qos")            \
+    PIPELINE_STAGE(SWITCH, OUT, STATEFUL,       10, "ls_out_stateful")       \
+    PIPELINE_STAGE(SWITCH, OUT, CHECK_PORT_SEC, 11, "ls_out_check_port_sec") \
+    PIPELINE_STAGE(SWITCH, OUT, APPLY_PORT_SEC, 12, "ls_out_apply_port_sec") \
                                                                       \
     /* Logical router ingress stages. */                              \
     PIPELINE_STAGE(ROUTER, IN,  ADMISSION,       0, "lr_in_admission")    \
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 0dc577393..75d171f77 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -2280,7 +2280,25 @@  output;
       </li>
     </ul>
 
-    <h3>Egress Table 0: <code>to-lport</code> Pre-ACLs</h3>
+    <h3>Egress Table 0: Lookup MAC address learning table</h3>
+    <p>
+      This is similar to ingress table <code>Lookup MAC address learning table
+      </code> with the difference that MAC address learning lookup is only
+      happening for ports with type <code>remote</code> whose port security is
+      disabled and 'unknown' address set.  This stage facilitates MAC learning
+      on a transit switch connecting multiple availability zones.
+    </p>
+
+    <h3>Egress Table 1: Learn MAC of 'unknown' ports.</h3>
+    <p>
+      This is similar to ingress table <code>Learn MAC of 'unknown' ports
+      </code> with the difference that MAC address learning is only happening
+      for ports with type <code>remote</code> whose port security is disabled
+      and 'unknown' address set.  This stage facilitates MAC learning on a
+      transit switch connecting multiple availability zones.
+    </p>
+
+    <h3>Egress Table 2: <code>to-lport</code> Pre-ACLs</h3>
 
     <p>
       This is similar to ingress table <code>Pre-ACLs</code> except for
@@ -2307,7 +2325,7 @@  output;
     </p>
 
 
-    <h3>Egress Table 1: Pre-LB</h3>
+    <h3>Egress Table 3: Pre-LB</h3>
 
     <p>
       This table is similar to ingress table <code>Pre-LB</code>.  It
@@ -2342,7 +2360,7 @@  output;
       logical router datapath from logical switch datapath for routing.
     </p>
 
-    <h3>Egress Table 2: Pre-stateful</h3>
+    <h3>Egress Table 4: Pre-stateful</h3>
 
     <p>
       This is similar to ingress table <code>Pre-stateful</code>.  This table
@@ -2372,12 +2390,12 @@  output;
       </li>
     </ul>
 
-    <h3>Egress Table 3: <code>from-lport</code> ACL hints</h3>
+    <h3>Egress Table 5: <code>from-lport</code> ACL hints</h3>
     <p>
       This is similar to ingress table <code>ACL hints</code>.
     </p>
 
-    <h3>Egress Table 4: <code>to-lport</code> ACL evaluation</h3>
+    <h3>Egress Table 6: <code>to-lport</code> ACL evaluation</h3>
 
     <p>
       This is similar to ingress table <code>ACL eval</code> except for
@@ -2435,31 +2453,31 @@  output;
       </li>
     </ul>
 
-    <h3>Egress Table 5: <code>to-lport</code> ACL sampling</h3>
+    <h3>Egress Table 7: <code>to-lport</code> ACL sampling</h3>
     <p>
       This is similar to ingress table <code>ACL sampling</code>.
     </p>
 
-    <h3>Egress Table 6: <code>to-lport</code> ACL action</h3>
+    <h3>Egress Table 8: <code>to-lport</code> ACL action</h3>
     <p>
       This is similar to ingress table <code>ACL action</code>.
     </p>
 
-    <h3>Egress Table 7: <code>to-lport</code> QoS</h3>
+    <h3>Egress Table 9: <code>to-lport</code> QoS</h3>
 
     <p>
       This is similar to ingress table <code>QoS</code> except
       they apply to <code>to-lport</code> QoS rules.
     </p>
 
-    <h3>Egress Table 8: Stateful</h3>
+    <h3>Egress Table 10: Stateful</h3>
 
     <p>
       This is similar to ingress table <code>Stateful</code> except that
       there are no rules added for load balancing new connections.
     </p>
 
-    <h3>Egress Table 9: Egress Port Security - check</h3>
+    <h3>Egress Table 11: Egress Port Security - check</h3>
 
     <p>
       This is similar to the port security logic in table
@@ -2488,7 +2506,7 @@  output;
       </li>
     </ul>
 
-    <h3>Egress Table 10: Egress Port Security - Apply</h3>
+    <h3>Egress Table 12: Egress Port Security - Apply</h3>
 
     <p>
       This is similar to the ingress port security logic in ingress table
diff --git a/tests/ovn-ic.at b/tests/ovn-ic.at
index 7eecbfc61..20968208e 100644
--- a/tests/ovn-ic.at
+++ b/tests/ovn-ic.at
@@ -2779,6 +2779,26 @@  ovn_as az3
 check ovn-nbctl --wait=hv sync
 ovn-sbctl dump-flows > az3/sbflows
 
+dnl Check that FDB learning is enabled for remote ports in the egress pipeline.
+AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az1/sbflows | ovn_strip_lflows], [0], [dnl
+  table=??(ls_out_lookup_fdb  ), priority=100  , match=(inport == "spine-to-ls2"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+  table=??(ls_out_lookup_fdb  ), priority=100  , match=(inport == "spine-to-ls3"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+  table=??(ls_out_put_fdb     ), priority=100  , match=(inport == "spine-to-ls2" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+  table=??(ls_out_put_fdb     ), priority=100  , match=(inport == "spine-to-ls3" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+])
+AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az2/sbflows | ovn_strip_lflows], [0], [dnl
+  table=??(ls_out_lookup_fdb  ), priority=100  , match=(inport == "spine-to-ls1"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+  table=??(ls_out_lookup_fdb  ), priority=100  , match=(inport == "spine-to-ls3"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+  table=??(ls_out_put_fdb     ), priority=100  , match=(inport == "spine-to-ls1" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+  table=??(ls_out_put_fdb     ), priority=100  , match=(inport == "spine-to-ls3" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+])
+AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az3/sbflows | ovn_strip_lflows], [0], [dnl
+  table=??(ls_out_lookup_fdb  ), priority=100  , match=(inport == "spine-to-ls1"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+  table=??(ls_out_lookup_fdb  ), priority=100  , match=(inport == "spine-to-ls2"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+  table=??(ls_out_put_fdb     ), priority=100  , match=(inport == "spine-to-ls1" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+  table=??(ls_out_put_fdb     ), priority=100  , match=(inport == "spine-to-ls2" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+])
+
 check ovn-ic-nbctl --wait=sb sync
 
 ovn-ic-nbctl show > ic-nbctl.dump
@@ -2836,15 +2856,13 @@  ovn_as az1
 wait_row_count FDB 1
 check ovn-nbctl --wait=hv sync
 
-# Only one entry is expected in the other zones as well - the entry for
-# the ls[23]-to-spine port in ls[23] switches.  Technically, we also need
-# an entry for a remote spine-to-ls1 port, but learning from remote ports
-# is not implemented yet.
+# Two entries are expected in the other zones - one for the remote port on
+# the spine switch and one for the switch port on the leaf.
 ovn_as az2
-wait_row_count FDB 1
+wait_row_count FDB 2
 check ovn-nbctl --wait=hv sync
 ovn_as az3
-wait_row_count FDB 1
+wait_row_count FDB 2
 check ovn-nbctl --wait=hv sync
 
 # FDB entry was created from the userspace() action in the datapath, but
@@ -2887,9 +2905,10 @@  OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected])
 reply=$(fmt_pkt "Ether(dst='${src_mac}', src='${dst_mac}')/ \
                  IP(src='${dst_ip}', dst='${src_ip}')/ \
                  UDP(sport=4369, dport=1538)")
-# Reply packet is still learned and broadcasted in the spine switch, because
-# learning from remote ports is not implemented, so we don't know where the
-# vif1 is located, even though we received some traffic from it.
+# For the reply packet we expect only one userspace action for FDB update on
+# the spine switch and only one tunnel push and send, because we already
+# learned that MAC of vif1 is behind spine-ls1 and no longer need to broadcast
+# to zone 3.
 AT_CHECK([ovn_as az2 as hv2 ovs-appctl ofproto/trace --names \
                 br-int in_port=vif3 $reply > ofproto-trace-2])
 AT_CAPTURE_FILE([ofproto-trace-2])
@@ -2899,9 +2918,6 @@  Megaflow: recirc_id=0,eth,ip,in_port=vif3,dl_src=f0:00:00:01:02:03,dl_dst=f0:00:
 AT_CHECK([cat ofproto-trace-2 | tail -1 \
             | grep -oE 'tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
 userspace
-clone
-tnl_push
-br-phys_n1
 tnl_push
 br-phys_n1
 ])
@@ -2909,24 +2925,57 @@  br-phys_n1
 # Now actually send it.
 check as hv2 ovs-appctl netdev-dummy/receive vif3 $reply
 
-# Zones 1 and 2 should have 2 FDB entries now each.  One per side of a
-# switch-switch port connecting ls[12] with the spine.  Zone 3 only has two
-# entries on ls3 for traffic broadcasted in the spine from both vif1 and vif3.
-ovn_as az1 wait_row_count FDB 2
-ovn_as az2 wait_row_count FDB 2
-ovn_as az3 wait_row_count FDB 2
-
 AT_CHECK([echo $reply > reply])
 # Check that it is delivered where needed and not delivered where not.
-# While the traffic is broadcasted within the spine and arrives in zone 3, the
-# packets must be dropped, because ls3 learned that their destination addresses
-# are behind the spine switch, so no new packets should be seen on vif5.
 OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [reply])
 OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [empty])
 OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
 OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [empty])
 OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected])
 
+# Zones 1 and 2 should have 2 FDB entries now each.  One for a remote port and
+# one per side of a switch-switch port connecting ls[12] with the spine.  Zone
+# 3 still only has two entries created for the original request from vif1.
+ovn_as az1
+wait_row_count FDB 3
+check ovn-nbctl --wait=hv sync
+ovn_as az2
+wait_row_count FDB 3
+check ovn-nbctl --wait=hv sync
+ovn_as az3
+wait_row_count FDB 2
+check ovn-nbctl --wait=hv sync
+
+# Packets should flow directly to the destination (via tunnels) in both
+# directions now.
+AT_CHECK([as hv1 ovs-appctl ofproto/trace --names \
+            br-int in_port=vif1 $packet | tail -2 \
+            | grep -oE 'Megaflow.*|tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
+Megaflow: recirc_id=0,eth,ip,in_port=vif1,dl_src=f0:00:00:01:02:01,dl_dst=f0:00:00:01:02:03,nw_ecn=0,nw_frag=no
+tnl_push
+br-phys_n1
+])
+AT_CHECK([as hv2 ovs-appctl ofproto/trace --names \
+            br-int in_port=vif3 $reply | tail -2 \
+            | grep -oE 'Megaflow.*|tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
+Megaflow: recirc_id=0,eth,ip,in_port=vif3,dl_src=f0:00:00:01:02:03,dl_dst=f0:00:00:01:02:01,nw_ecn=0,nw_frag=no
+tnl_push
+br-phys_n1
+])
+
+# Send and check one more time.
+check as hv1 ovs-appctl netdev-dummy/receive vif1 $packet
+check as hv2 ovs-appctl netdev-dummy/receive vif3 $reply
+
+AT_CHECK([cp expected expected-vif5])
+AT_CHECK([echo $packet >> expected])
+AT_CHECK([echo $reply >> reply])
+OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [reply])
+OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [empty])
+OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
+OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [empty])
+OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected-vif5])
+
 OVN_CLEANUP_IC([az1], [az2], [az3])
 AT_CLEANUP
 ])