diff mbox series

[ovs-dev,RFC,v2] northd: Fix routing loop in LRs with one-to-many SNAT

Message ID 20210816085206.69170-1-kklimonda@syntaxhighlighted.com
State New
Headers show
Series [ovs-dev,RFC,v2] northd: Fix routing loop in LRs with one-to-many SNAT | expand

Commit Message

Krzysztof Klimonda Aug. 16, 2021, 8:52 a.m. UTC
If there are snat entries on the router, and some logical_ip are set to
network instead of an IP address then given SNAT is masquerade. In such
case ct_snat action is used in lr_in_unsnat table to ensure that the
packet is matched against conntrack and destination IP is replaced with
one from matching conntrack entry.

This however breaks down for new connections sent to router's external IP
address. In such case, when packet is checked against conntrack table,
there is no match, and its destination IP remains unchanged. This causes a
loop in lr_in_ip_routing.

This commit installs a new logical flow in lr_in_ip_routing table for routers
that have SNAT entry with logical_ip set to network (that being masquerade).
This flow drops packets that, after going through conntrack via ct_snat action
in lr_in_unsnat table, are not matched to any existing flow and retain the
original destination IP. This prevents vswitchd from looping such packets until
their TTL reaches zero, as well as installing bogus flows in datapath that lead
to ovs module dropping such packets with "deferred action limit reached, drop
recirc action" message.

Signed-off-by: Krzysztof Klimonda <kklimonda@syntaxhighlighted.com>
---
There are two FIXME that should be handled:
- I tried dropping ct.new from the flow, and that breaks E/W SNAT/DNAT
  traffic (DNAT and SNAT on distributed router - E/W -- ovn-northd). See
  comment in ovn-northd.c/ovn_northd.dl
- I can't get my testcase pass with the master - when pinging router gateway
  IP first packet is seemlingly dropped, with the following warning logged
  in vswitchd:
  "Failed to acquire udpif_key corresponding to unexpected flow"

v1 -> v2
* Rebased patch on current master
* Reworked logic to check if nat->type is snat and masquerade
* Added loadbalancer-related check to system-userspace testcase
* Added ddlog implementation

 northd/ovn-northd.c  |  72 +++++++++++++++++++++++--
 northd/ovn_northd.dl |  53 +++++++++++++++++++
 tests/ovn.at         |  53 +++++++++++++++++++
 tests/system-ovn.at  | 122 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 295 insertions(+), 5 deletions(-)
diff mbox series

Patch

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 52fc255ae..517357884 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -12250,8 +12250,9 @@  build_lrouter_ingress_flow(struct hmap *lflows, struct ovn_datapath *od,
 
 static int
 lrouter_check_nat_entry(struct ovn_datapath *od, const struct nbrec_nat *nat,
-                        ovs_be32 *mask, bool *is_v6, int *cidr_bits,
-                        struct eth_addr *mac, bool *distributed)
+                        ovs_be32 *mask, bool *is_v6, bool *is_masq,
+                        int *cidr_bits, struct eth_addr *mac,
+                        bool *distributed)
 {
     struct in6_addr ipv6, mask_v6, v6_exact = IN6ADDR_EXACT_INIT;
     ovs_be32 ip;
@@ -12285,14 +12286,21 @@  lrouter_check_nat_entry(struct ovn_datapath *od, const struct nbrec_nat *nat,
         *is_v6 = true;
     }
 
+    *is_masq = false;
     /* Check the validity of nat->logical_ip. 'logical_ip' can
     * be a subnet when the type is "snat". */
     if (*is_v6) {
         error = ipv6_parse_masked(nat->logical_ip, &ipv6, &mask_v6);
         *cidr_bits = ipv6_count_cidr_bits(&mask_v6);
+        if (*cidr_bits < 128) {
+            *is_masq = true;
+        }
     } else {
         error = ip_parse_masked(nat->logical_ip, &ip, mask);
         *cidr_bits = ip_count_cidr_bits(*mask);
+        if (*cidr_bits < 32) {
+            *is_masq = true;
+        }
     }
     if (!strcmp(nat->type, "snat")) {
         if (error) {
@@ -12397,12 +12405,12 @@  build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od, struct hmap *lflows,
     for (int i = 0; i < od->nbr->n_nat; i++) {
         const struct nbrec_nat *nat = nat = od->nbr->nat[i];
         struct eth_addr mac = eth_addr_broadcast;
-        bool is_v6, distributed;
+        bool is_v6, is_masq, distributed;
         ovs_be32 mask;
         int cidr_bits;
 
-        if (lrouter_check_nat_entry(od, nat, &mask, &is_v6, &cidr_bits,
-                                    &mac, &distributed) < 0) {
+        if (lrouter_check_nat_entry(od, nat, &mask, &is_v6, &is_masq,
+                                    &cidr_bits, &mac, &distributed) < 0) {
             continue;
         }
 
@@ -12413,6 +12421,60 @@  build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od, struct hmap *lflows,
         build_lrouter_in_dnat_flow(lflows, od, nat, match, actions, distributed,
                                    mask, is_v6);
 
+        /* When router have SNAT enabled, and logical_ip is a network (router
+         * is doing masquerade), then we need to make sure that packets
+         * unrelated to any established connection that still have router's
+         * external IP as a next hop after going through lr_in_unsnat table
+         * are dropped properly. Otherwise such packets will loop around
+         * between tables until their ttl reaches zero - this additionally
+         * causes kernel module to drop such packages due to recirculation
+         * limit being exceeded.
+         *
+         * Install a logical flow in lr_in_ip_routing table that will
+         * drop packet with router's external IP as its destination unless
+         * they are part of an existing conntrack connection. Match on conntrack
+         * is needed to keep E/W NAT working properly where router gatewat IP
+         * becomes a source in SNAT->DNAT+SNAT scenario:
+         *
+         * foo1(192.168.1.2) ->
+         * router (snat 192.168.1.0/24 to 172.16.1.1) ->
+         * bar1(192.168.2.2 with dnat_and_snat to 172.16.1.4 on the router)
+         *
+         * In this scenario traffic from foo1 to 172.16.1.4 is first SNAT
+         * on router, it's source IP being replaced with 172.16.1.1 before
+         * being forwarded to bar1. The return traffic has 172.16.1.1 as
+         * destination IP (because of SNAT) and routing table is dropping
+         * packet.
+         *
+         * The priority for destination routes is calculated as
+         * (prefix length * 2) + 1, and there is an additional flow
+         * for when BFD is in play with priority + 1. Set priority that
+         * is higher than any other potential routing flow for that
+         * network, that is (prefix length * 2) + offset, where offset
+         * is 1 (dst) + 1 (bfd) + 1. */
+        if (!strcmp(nat->type, "snat") && is_masq) {
+            uint16_t priority, prefix_length, offset;
+            prefix_length = is_v6 ? 128 : 32;
+            offset = 3;
+            priority = (prefix_length * 2) + offset;
+
+            ds_clear(match);
+
+            /* FIXME(kklimonda): why isn't ip.dst (in E/W SNAT case) not
+             * replaced by the time returning packet lands in
+             * lr_in_ip_routing table?
+            /* match on ct.new to pass through E/W SNAT traffic */
+            ds_put_format(match,
+                          "ct.new && ip%s.dst == %s",
+                          is_v6 ? "6" : "4",
+                          nat->external_ip);
+
+            ovn_lflow_add_with_hint(lflows, od,
+                                    S_ROUTER_IN_IP_ROUTING, priority,
+                                    ds_cstr(match), "drop;",
+                                    &nat->header_);
+        }
+
         /* ARP resolve for NAT IPs. */
         if (od->is_gw_router) {
             /* Add the NAT external_ip to the nat_entries for
diff --git a/northd/ovn_northd.dl b/northd/ovn_northd.dl
index a94726351..46f43aa0d 100644
--- a/northd/ovn_northd.dl
+++ b/northd/ovn_northd.dl
@@ -6013,6 +6013,59 @@  for (r in &Router(._uuid = lr_uuid,
                 }
             };
 
+            /* When router have SNAT enabled, and logical_ip is a network (router
+             * is doing masquerade), then we need to make sure that packets
+             * unrelated to any established connection that still have router's
+             * external IP as a next hop after going through lr_in_unsnat table
+             * are dropped properly. Otherwise such packets will loop around
+             * between tables until their ttl reaches zero - this additionally
+             * causes kernel module to drop such packages due to recirculation
+             * limit being exceeded.
+             *
+             * Install a logical flow in lr_in_ip_routing table that will
+             * drop packet with router's external IP as its destination unless
+             * they are part of an existing conntrack connection. Match on conntrack
+             * is needed to keep E/W NAT working properly where router gatewat IP
+             * becomes a source in SNAT->DNAT+SNAT scenario:
+             *
+             * foo1(192.168.1.2) ->
+             * router (snat 192.168.1.0/24 to 172.16.1.1) ->
+             * bar1(192.168.2.2 with dnat_and_snat to 172.16.1.4 on the router)
+             *
+             * In this scenario traffic from foo1 to 172.16.1.4 is first SNAT
+             * on router, it's source IP being replaced with 172.16.1.1 before
+             * being forwarded to bar1. The return traffic has 172.16.1.1 as
+             * destination IP (because of SNAT) and routing table is dropping
+             * packet.
+             *
+             * The priority for destination routes is calculated as
+             * (prefix length * 2) + 1, and there is an additional flow
+             * for when BFD is in play with priority + 1. Set priority that
+             * is higher than any other potential routing flow for that
+             * network, that is (prefix length * 2) + offset, where offset
+             * is 1 (dst) + 1 (bfd) + 1. */
+            Some{var plen} = mask.cidr_bits() in
+            var is_masq = match(nat.external_ip) {
+                IPv4{_} -> {if ((plen as integer) < 32) { true } else { false }},
+                IPv6{_} -> {if ((plen as integer) < 128) { true } else { false }},
+            } in
+            if (nat.nat.__type == "snat" and is_masq) {
+                var prefix_length = match (nat.external_ip) {
+                    IPv4{_} -> 32,
+                    IPv6{_} -> 128
+                } in
+                var offset = 3 in
+                var priority = (prefix_length * 2) + offset in
+                var __match = "ct.new"
+                " && ${ipX}.dst == ${nat.nat.external_ip}" in
+                Flow(.logical_datapath = lr_uuid,
+                     .stage            = s_ROUTER_IN_IP_ROUTING(),
+                     .priority         = priority,
+                     .__match          = __match,
+                     .actions          = "drop;",
+                     .external_ids     = stage_hint(nat.nat._uuid))
+            };
+
             /* Ingress DNAT table: Packets enter the pipeline with destination
              * IP address that needs to be DNATted from a external IP address
              * to a logical IP address. */
diff --git a/tests/ovn.at b/tests/ovn.at
index 7144b35fe..4bab8833a 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -24498,6 +24498,59 @@  OVN_CLEANUP([hv1])
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn -- SNAT handling of unrelated packets with masquerade"])
+ovn_start
+
+ovn-nbctl lr-add R1
+
+ovn-nbctl ls-add sw0
+ovn-nbctl ls-add public
+
+ovn-nbctl lsp-add sw0 sw0-port1 \
+          -- lsp-set-addresses sw0-port1 "00:00:00:01:02:03 192.168.1.3 1921::3"
+
+ovn-nbctl lrp-add R1 rp-sw0 00:00:01:01:02:03 192.168.1.1/24
+ovn-nbctl lrp-add R1 rp-public 00:00:02:01:02:03 172.16.1.254/24 1921::1/64 \
+    -- lrp-set-gateway-chassis rp-public hv1
+
+ovn-nbctl lsp-add sw0 sw0-rp -- set Logical_Switch_Port sw0-rp \
+    type=router options:router-port=rp-sw0 \
+    -- lsp-set-addresses sw0-rp router
+
+ovn-nbctl lsp-add public public-rp -- set Logical_Switch_Port public-rp \
+    type=router options:router-port=rp-public \
+    -- lsp-set-addresses public-rp router
+
+AT_CHECK([ovn-nbctl lr-nat-add R1 dnat_and_snat 172.16.1.2 192.168.1.3 sw0-port1 00:00:00:01:02:03])
+AT_CHECK([ovn-nbctl lr-nat-add R1 dnat 172.16.1.3 192.168.1.4])
+AT_CHECK([check ovn-nbctl --wait=sb sync])
+
+# Before we add second NAT (masquerade) there should be no flows
+# in lr_in_ip_routing that match against conntrack state.
+AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_routing.*ct.new"], [1], [])
+
+ovn-nbctl lr-nat-add R1 snat 172.16.1.254 192.168.1.0/24
+ovn-nbctl lr-nat-add R1 snat 1921::1 1921::0/64
+
+AT_CHECK([check ovn-nbctl --wait=sb sync])
+
+AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_routing.*172.16.1.254.*drop"],
+[0], [dnl
+  table=10(lr_in_ip_routing   ), priority=67   dnl
+, match=(ip4.dst == 172.16.1.254), dnl
+action=(drop;)
+])
+
+AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_ip_routing.*1921.*drop"],
+[0], [dnl
+  table=10(lr_in_ip_routing   ), priority=259  dnl
+, match=(ip6.dst == 1921::1), action=(drop;)
+])
+
+AT_CLEANUP
+])
+
 OVN_FOR_EACH_NORTHD([
 AT_SETUP([ARP replies for SNAT external ips])
 ovn_start
diff --git a/tests/system-ovn.at b/tests/system-ovn.at
index aadd68634..8cb98c4c5 100644
--- a/tests/system-ovn.at
+++ b/tests/system-ovn.at
@@ -5966,6 +5966,128 @@  OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([SNAT handling of unrelated packets with masquerade"])
+AT_SKIP_IF([test $HAVE_NC = no])
+
+ovn_start
+OVS_TRAFFIC_VSWITCHD_START()
+
+ADD_BR([br-int])
+ADD_BR([br-ext])
+
+ovs-ofctl add-flow br-ext action=normal
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+ovn-nbctl lr-add R1
+
+ovn-nbctl ls-add sw0
+ovn-nbctl ls-add public
+
+ovn-nbctl lrp-add R1 rp-sw0 00:00:01:01:02:03 192.168.1.1/24
+ovn-nbctl lrp-add R1 rp-public 00:00:02:01:02:03 172.16.1.254/24 \
+    -- lrp-set-gateway-chassis rp-public hv1
+
+ovn-nbctl lsp-add sw0 sw0-rp -- set Logical_Switch_Port sw0-rp \
+    type=router options:router-port=rp-sw0 \
+    -- lsp-set-addresses sw0-rp router
+
+ovn-nbctl lsp-add public public-rp -- set Logical_Switch_Port public-rp \
+    type=router options:router-port=rp-public \
+    -- lsp-set-addresses public-rp router
+
+ovn-nbctl lr-nat-add R1 snat 172.16.1.254 192.168.1.0/24
+
+ovn-nbctl lb-add lb1 172.16.1.254:90 192.168.1.2:90
+ovn-nbctl lr-lb-add R1 lb1
+
+ADD_NAMESPACES(sw01-x)
+ADD_VETH(sw01-x, sw01-x, br-int, "192.168.1.2/24", "f0:00:00:01:02:03", \
+         "192.168.1.1")
+ovn-nbctl lsp-add sw0 sw01-x \
+    -- lsp-set-addresses sw01-x "f0:00:00:01:02:03 192.168.1.2"
+
+OVS_WAIT_UNTIL([test x$(ovn-nbctl lsp-get-up sw01-x) = xup])
+
+ADD_NAMESPACES(ext-foo)
+ADD_VETH(ext-foo, ext-foo, br-ext, "172.16.1.100/24", "00:10:10:01:02:13", \
+         "172.16.1.254")
+
+OVS_WAIT_UNTIL([test "$(ip netns exec ext-foo ip a | grep 172.16 | grep tentative)" = ""])
+
+AT_CHECK([ovs-vsctl set Open_vSwitch . external-ids:ovn-bridge-mappings=phynet:br-ext])
+ovn-nbctl lsp-add public public1 \
+        -- lsp-set-addresses public1 unknown \
+        -- lsp-set-type public1 localnet \
+        -- lsp-set-options public1 network_name=phynet
+
+ovn-nbctl --wait=hv sync
+
+# Send ping from ext-foo to router's external IP to verify that incoming ICMP is working
+# FIXME(kklimonda): first packet from this ping call fails with EAGAIN error
+# vswitchd logs the following for the first packet:
+# Failed to acquire udpif_key corresponding to unexpected flow (Invalid argument): ufid:[uuid]
+NS_CHECK_EXEC([ext-foo], [ping -q -c 3 -i 0.3 -w 2 172.16.1.254 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Verify that we can establish an outgoing connection from host behind
+# SNAT to one on the external network.
+NS_CHECK_EXEC([ext-foo], [echo response | nc -l 80 &], [], [])
+
+NS_CHECK_EXEC([sw01-x], [nc -w 1 172.16.1.100 80], [0], [dnl
+response
+])
+
+# Check that incoming connections to the loadbalancer port are being
+# forwared to the backend properly.
+NS_CHECK_EXEC([sw01-x], [echo response | nc -l 90 &], [], [])
+
+NS_CHECK_EXEC([ext-foo], [nc -w 1 172.16.1.254 90], [0], [dnl
+response
+])
+
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Check that incoming packets that are not part of any established
+# connection are dropped.
+NS_CHECK_EXEC([ext-foo], [nc -w 1 172.16.1.254 80], [1], [])
+
+# There shouldn't be any conntrack entry for the gateway IP
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.1.254)], [0], [])
+
+# vswitchd should not complaing about recirculation depth being exceeded,
+# which would indicate that packet is not properly dropped, but instead
+# loops between vswitchd flow tables until its ttl is down to 0.
+AT_CHECK([grep -qE 'Packet dropped. Max recirculation depth exceeded.' ovs-vswitchd.log], [1])
+
+OVS_APP_EXIT_AND_WAIT([ovn-controller])
+
+as ovn-sb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as ovn-nb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as northd
+OVS_APP_EXIT_AND_WAIT([NORTHD_TYPE])
+
+as
+OVS_TRAFFIC_VSWITCHD_STOP(["/.*error receiving.*/d
+/.*terminating with signal 15.*/d"])
+AT_CLEANUP
+])
+
 OVN_FOR_EACH_NORTHD([
 AT_SETUP([ARP resolution for SNAT IP])
 ovn_start