@@ -2219,7 +2219,9 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out)
}
case LP_REMOTE:
- /* Nothing to be done for REMOTE type. */
+ /* Remote ports are related, since we can have rules in the
+ * egress pipeline matching on traffic coming from another AZ. */
+ update_related_lport(pb, b_ctx_out);
break;
case LP_UNKNOWN: {
@@ -3131,6 +3133,9 @@ handle_updated_port(struct binding_ctx_in *b_ctx_in,
}
case LP_REMOTE:
+ update_related_lport(pb, b_ctx_out);
+ break;
+
case LP_UNKNOWN:
break;
}
@@ -5763,7 +5763,9 @@ build_lswitch_learn_fdb_op(
return;
}
- if (!strcmp(op->nbsp->type, "") || lsp_is_switch(op->nbsp)
+ bool remote = lsp_is_remote(op->nbsp);
+
+ if (remote || !strcmp(op->nbsp->type, "") || lsp_is_switch(op->nbsp)
|| (lsp_is_localnet(op->nbsp) && localnet_can_learn_mac(op->nbsp))) {
ds_clear(match);
ds_clear(actions);
@@ -5774,7 +5776,9 @@ build_lswitch_learn_fdb_op(
ds_put_format(actions, REGBIT_LKUP_FDB
" = lookup_fdb(inport, eth.src); next;");
ovn_lflow_add_with_lport_and_hint(lflows, op->od,
- S_SWITCH_IN_LOOKUP_FDB, 100,
+ remote ? S_SWITCH_OUT_LOOKUP_FDB
+ : S_SWITCH_IN_LOOKUP_FDB,
+ 100,
ds_cstr(match), ds_cstr(actions),
op->key, &op->nbsp->header_,
op->lflow_ref);
@@ -5782,7 +5786,9 @@ build_lswitch_learn_fdb_op(
ds_put_cstr(match, " && "REGBIT_LKUP_FDB" == 0");
ds_clear(actions);
ds_put_cstr(actions, "put_fdb(inport, eth.src); next;");
- ovn_lflow_add_with_lport_and_hint(lflows, op->od, S_SWITCH_IN_PUT_FDB,
+ ovn_lflow_add_with_lport_and_hint(lflows, op->od,
+ remote ? S_SWITCH_OUT_PUT_FDB
+ : S_SWITCH_IN_PUT_FDB,
100, ds_cstr(match),
ds_cstr(actions), op->key,
&op->nbsp->header_,
@@ -5802,6 +5808,11 @@ build_lswitch_learn_fdb_od(
lflow_ref);
ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 0, "1",
"outport = get_fdb(eth.dst); next;", lflow_ref);
+
+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_LOOKUP_FDB, 0, "1", "next;",
+ lflow_ref);
+ ovn_lflow_add(lflows, od, S_SWITCH_OUT_PUT_FDB, 0, "1", "next;",
+ lflow_ref);
}
/* Egress tables 8: Egress port security - IP (priority 0)
@@ -498,17 +498,19 @@ enum ovn_stage {
PIPELINE_STAGE(SWITCH, IN, L2_UNKNOWN, 29, "ls_in_l2_unknown") \
\
/* Logical switch egress stages. */ \
- PIPELINE_STAGE(SWITCH, OUT, PRE_ACL, 0, "ls_out_pre_acl") \
- PIPELINE_STAGE(SWITCH, OUT, PRE_LB, 1, "ls_out_pre_lb") \
- PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful") \
- PIPELINE_STAGE(SWITCH, OUT, ACL_HINT, 3, "ls_out_acl_hint") \
- PIPELINE_STAGE(SWITCH, OUT, ACL_EVAL, 4, "ls_out_acl_eval") \
- PIPELINE_STAGE(SWITCH, OUT, ACL_SAMPLE, 5, "ls_out_acl_sample") \
- PIPELINE_STAGE(SWITCH, OUT, ACL_ACTION, 6, "ls_out_acl_action") \
- PIPELINE_STAGE(SWITCH, OUT, QOS, 7, "ls_out_qos") \
- PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 8, "ls_out_stateful") \
- PIPELINE_STAGE(SWITCH, OUT, CHECK_PORT_SEC, 9, "ls_out_check_port_sec") \
- PIPELINE_STAGE(SWITCH, OUT, APPLY_PORT_SEC, 10, "ls_out_apply_port_sec") \
+ PIPELINE_STAGE(SWITCH, OUT, LOOKUP_FDB, 0, "ls_out_lookup_fdb") \
+ PIPELINE_STAGE(SWITCH, OUT, PUT_FDB, 1, "ls_out_put_fdb") \
+ PIPELINE_STAGE(SWITCH, OUT, PRE_ACL, 2, "ls_out_pre_acl") \
+ PIPELINE_STAGE(SWITCH, OUT, PRE_LB, 3, "ls_out_pre_lb") \
+ PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 4, "ls_out_pre_stateful") \
+ PIPELINE_STAGE(SWITCH, OUT, ACL_HINT, 5, "ls_out_acl_hint") \
+ PIPELINE_STAGE(SWITCH, OUT, ACL_EVAL, 6, "ls_out_acl_eval") \
+ PIPELINE_STAGE(SWITCH, OUT, ACL_SAMPLE, 7, "ls_out_acl_sample") \
+ PIPELINE_STAGE(SWITCH, OUT, ACL_ACTION, 8, "ls_out_acl_action") \
+ PIPELINE_STAGE(SWITCH, OUT, QOS, 9, "ls_out_qos") \
+ PIPELINE_STAGE(SWITCH, OUT, STATEFUL, 10, "ls_out_stateful") \
+ PIPELINE_STAGE(SWITCH, OUT, CHECK_PORT_SEC, 11, "ls_out_check_port_sec") \
+ PIPELINE_STAGE(SWITCH, OUT, APPLY_PORT_SEC, 12, "ls_out_apply_port_sec") \
\
/* Logical router ingress stages. */ \
PIPELINE_STAGE(ROUTER, IN, ADMISSION, 0, "lr_in_admission") \
@@ -2280,7 +2280,25 @@ output;
</li>
</ul>
- <h3>Egress Table 0: <code>to-lport</code> Pre-ACLs</h3>
+ <h3>Egress Table 0: Lookup MAC address learning table</h3>
+ <p>
+ This is similar to ingress table <code>Lookup MAC address learning table
+ </code> with the difference that MAC address learning lookup is only
+ happening for ports with type <code>remote</code> whose port security is
+ disabled and 'unknown' address set. This stage facilitates MAC learning
+ on a transit switch connecting multiple availability zones.
+ </p>
+
+ <h3>Egress Table 1: Learn MAC of 'unknown' ports.</h3>
+ <p>
+ This is similar to ingress table <code>Learn MAC of 'unknown' ports
+ </code> with the difference that MAC address learning is only happening
+ for ports with type <code>remote</code> whose port security is disabled
+ and 'unknown' address set. This stage facilitates MAC learning on a
+ transit switch connecting multiple availability zones.
+ </p>
+
+ <h3>Egress Table 2: <code>to-lport</code> Pre-ACLs</h3>
<p>
This is similar to ingress table <code>Pre-ACLs</code> except for
@@ -2307,7 +2325,7 @@ output;
</p>
- <h3>Egress Table 1: Pre-LB</h3>
+ <h3>Egress Table 3: Pre-LB</h3>
<p>
This table is similar to ingress table <code>Pre-LB</code>. It
@@ -2342,7 +2360,7 @@ output;
logical router datapath from logical switch datapath for routing.
</p>
- <h3>Egress Table 2: Pre-stateful</h3>
+ <h3>Egress Table 4: Pre-stateful</h3>
<p>
This is similar to ingress table <code>Pre-stateful</code>. This table
@@ -2372,12 +2390,12 @@ output;
</li>
</ul>
- <h3>Egress Table 3: <code>from-lport</code> ACL hints</h3>
+ <h3>Egress Table 5: <code>from-lport</code> ACL hints</h3>
<p>
This is similar to ingress table <code>ACL hints</code>.
</p>
- <h3>Egress Table 4: <code>to-lport</code> ACL evaluation</h3>
+ <h3>Egress Table 6: <code>to-lport</code> ACL evaluation</h3>
<p>
This is similar to ingress table <code>ACL eval</code> except for
@@ -2435,31 +2453,31 @@ output;
</li>
</ul>
- <h3>Egress Table 5: <code>to-lport</code> ACL sampling</h3>
+ <h3>Egress Table 7: <code>to-lport</code> ACL sampling</h3>
<p>
This is similar to ingress table <code>ACL sampling</code>.
</p>
- <h3>Egress Table 6: <code>to-lport</code> ACL action</h3>
+ <h3>Egress Table 8: <code>to-lport</code> ACL action</h3>
<p>
This is similar to ingress table <code>ACL action</code>.
</p>
- <h3>Egress Table 7: <code>to-lport</code> QoS</h3>
+ <h3>Egress Table 9: <code>to-lport</code> QoS</h3>
<p>
This is similar to ingress table <code>QoS</code> except
they apply to <code>to-lport</code> QoS rules.
</p>
- <h3>Egress Table 8: Stateful</h3>
+ <h3>Egress Table 10: Stateful</h3>
<p>
This is similar to ingress table <code>Stateful</code> except that
there are no rules added for load balancing new connections.
</p>
- <h3>Egress Table 9: Egress Port Security - check</h3>
+ <h3>Egress Table 11: Egress Port Security - check</h3>
<p>
This is similar to the port security logic in table
@@ -2488,7 +2506,7 @@ output;
</li>
</ul>
- <h3>Egress Table 10: Egress Port Security - Apply</h3>
+ <h3>Egress Table 12: Egress Port Security - Apply</h3>
<p>
This is similar to the ingress port security logic in ingress table
@@ -2779,6 +2779,26 @@ ovn_as az3
check ovn-nbctl --wait=hv sync
ovn-sbctl dump-flows > az3/sbflows
+dnl Check that FDB learning is enabled for remote ports in the egress pipeline.
+AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az1/sbflows | ovn_strip_lflows], [0], [dnl
+ table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls2"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+ table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls3"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+ table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls2" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+ table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls3" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+])
+AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az2/sbflows | ovn_strip_lflows], [0], [dnl
+ table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls1"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+ table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls3"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+ table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls1" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+ table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls3" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+])
+AT_CHECK([grep -E "ls_out.*fdb.*spine-to-" az3/sbflows | ovn_strip_lflows], [0], [dnl
+ table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls1"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+ table=??(ls_out_lookup_fdb ), priority=100 , match=(inport == "spine-to-ls2"), action=(reg0[[11]] = lookup_fdb(inport, eth.src); next;)
+ table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls1" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+ table=??(ls_out_put_fdb ), priority=100 , match=(inport == "spine-to-ls2" && reg0[[11]] == 0), action=(put_fdb(inport, eth.src); next;)
+])
+
check ovn-ic-nbctl --wait=sb sync
ovn-ic-nbctl show > ic-nbctl.dump
@@ -2836,15 +2856,13 @@ ovn_as az1
wait_row_count FDB 1
check ovn-nbctl --wait=hv sync
-# Only one entry is expected in the other zones as well - the entry for
-# the ls[23]-to-spine port in ls[23] switches. Technically, we also need
-# an entry for a remote spine-to-ls1 port, but learning from remote ports
-# is not implemented yet.
+# Two entries are expected in the other zones - one for the remote port on
+# the spine switch and one for the switch port on the leaf.
ovn_as az2
-wait_row_count FDB 1
+wait_row_count FDB 2
check ovn-nbctl --wait=hv sync
ovn_as az3
-wait_row_count FDB 1
+wait_row_count FDB 2
check ovn-nbctl --wait=hv sync
# FDB entry was created from the userspace() action in the datapath, but
@@ -2887,9 +2905,10 @@ OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected])
reply=$(fmt_pkt "Ether(dst='${src_mac}', src='${dst_mac}')/ \
IP(src='${dst_ip}', dst='${src_ip}')/ \
UDP(sport=4369, dport=1538)")
-# Reply packet is still learned and broadcasted in the spine switch, because
-# learning from remote ports is not implemented, so we don't know where the
-# vif1 is located, even though we received some traffic from it.
+# For the reply packet we expect only one userspace action for FDB update on
+# the spine switch and only one tunnel push and send, because we already
+# learned that MAC of vif1 is behind spine-ls1 and no longer need to broadcast
+# to zone 3.
AT_CHECK([ovn_as az2 as hv2 ovs-appctl ofproto/trace --names \
br-int in_port=vif3 $reply > ofproto-trace-2])
AT_CAPTURE_FILE([ofproto-trace-2])
@@ -2899,9 +2918,6 @@ Megaflow: recirc_id=0,eth,ip,in_port=vif3,dl_src=f0:00:00:01:02:03,dl_dst=f0:00:
AT_CHECK([cat ofproto-trace-2 | tail -1 \
| grep -oE 'tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
userspace
-clone
-tnl_push
-br-phys_n1
tnl_push
br-phys_n1
])
@@ -2909,24 +2925,57 @@ br-phys_n1
# Now actually send it.
check as hv2 ovs-appctl netdev-dummy/receive vif3 $reply
-# Zones 1 and 2 should have 2 FDB entries now each. One per side of a
-# switch-switch port connecting ls[12] with the spine. Zone 3 only has two
-# entries on ls3 for traffic broadcasted in the spine from both vif1 and vif3.
-ovn_as az1 wait_row_count FDB 2
-ovn_as az2 wait_row_count FDB 2
-ovn_as az3 wait_row_count FDB 2
-
AT_CHECK([echo $reply > reply])
# Check that it is delivered where needed and not delivered where not.
-# While the traffic is broadcasted within the spine and arrives in zone 3, the
-# packets must be dropped, because ls3 learned that their destination addresses
-# are behind the spine switch, so no new packets should be seen on vif5.
OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [reply])
OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [empty])
OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [empty])
OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected])
+# Zones 1 and 2 should have 2 FDB entries now each. One for a remote port and
+# one per side of a switch-switch port connecting ls[12] with the spine. Zone
+# 3 still only has two entries created for the original request from vif1.
+ovn_as az1
+wait_row_count FDB 3
+check ovn-nbctl --wait=hv sync
+ovn_as az2
+wait_row_count FDB 3
+check ovn-nbctl --wait=hv sync
+ovn_as az3
+wait_row_count FDB 2
+check ovn-nbctl --wait=hv sync
+
+# Packets should flow directly to the destination (via tunnels) in both
+# directions now.
+AT_CHECK([as hv1 ovs-appctl ofproto/trace --names \
+ br-int in_port=vif1 $packet | tail -2 \
+ | grep -oE 'Megaflow.*|tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
+Megaflow: recirc_id=0,eth,ip,in_port=vif1,dl_src=f0:00:00:01:02:01,dl_dst=f0:00:00:01:02:03,nw_ecn=0,nw_frag=no
+tnl_push
+br-phys_n1
+])
+AT_CHECK([as hv2 ovs-appctl ofproto/trace --names \
+ br-int in_port=vif3 $reply | tail -2 \
+ | grep -oE 'Megaflow.*|tnl_push|userspace|clone|br-phys_n1|vif[[0-9]]'], [0], [dnl
+Megaflow: recirc_id=0,eth,ip,in_port=vif3,dl_src=f0:00:00:01:02:03,dl_dst=f0:00:00:01:02:01,nw_ecn=0,nw_frag=no
+tnl_push
+br-phys_n1
+])
+
+# Send and check one more time.
+check as hv1 ovs-appctl netdev-dummy/receive vif1 $packet
+check as hv2 ovs-appctl netdev-dummy/receive vif3 $reply
+
+AT_CHECK([cp expected expected-vif5])
+AT_CHECK([echo $packet >> expected])
+AT_CHECK([echo $reply >> reply])
+OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [reply])
+OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [empty])
+OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
+OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [empty])
+OVN_CHECK_PACKETS([hv3/vif5-tx.pcap], [expected-vif5])
+
OVN_CLEANUP_IC([az1], [az2], [az3])
AT_CLEANUP
])
Currently, OVN can't learn FDB entries from remote ports, because learning is happening in the ingress pipeline, while only the egress pipeline is executed in the destination availability zone. So, the destination zone can't learn the source MAC address and has to broadcast replies. The source zone also can't learn from replies, so has to broadcast all the subsequent packets as well. Fix that by introducing FDB learning in the egress pipeline for remote ports. Note: For some reason remote ports were not tracked by ovn-controller. This wasn't a big problem because we didn't use inport/outport matches on the traffic from remote ports in most cases. However, FDB lookups need to match on the input port, and ovn-controller will skip those flows if ports are not related, so marking them as such. (The removal is already handled in a generic way.) Fixes: 67100f0ca819 ("ic: Add support for spine-leaf topology for transit switches.") Signed-off-by: Ilya Maximets <i.maximets@ovn.org> --- controller/binding.c | 7 +++- northd/northd.c | 17 ++++++-- northd/northd.h | 24 ++++++----- northd/ovn-northd.8.xml | 40 +++++++++++++----- tests/ovn-ic.at | 93 +++++++++++++++++++++++++++++++---------- 5 files changed, 133 insertions(+), 48 deletions(-)