[ovs-dev,v2,6/6] ovn: Generate ICMPv4 packet in router pipeline for larger packets

Message ID 20190412130559.28693-1-nusiddiq@redhat.com
State Superseded
Headers show
Series
  • Address MTU issue for larger packets in OVN
Related show

Commit Message

Numan Siddique April 12, 2019, 1:05 p.m.
From: Numan Siddique <nusiddiq@redhat.com>

This patch adds 2 stages in router pipeline after ARP_RESOLVE
and adds the logical flows to check the packet length and
generate ICMPv4 packet.

   * S_ROUTER_IN_CHK_PKT_LEN - Which checks the packet length using
                               check_pkt_larger OVN action

   * S_ROUTER_IN_LARGER_PKTS - Which generates icmp packet with
                               type 3 (Destination Unreachable),
                               code 4 (Frag Needed and DF was Set)
                               icmp4.frag_mtu = gw_mtu

In order to add these logical flows, CMS should set the
option 'gateway_mtu' for the distributed logical router port.

Signed-off-by: Numan Siddique <nusiddiq@redhat.com>
Acked-by: Mark Michelson <mmichels@redhat.com>
---
 ovn/northd/ovn-northd.8.xml |  83 +++++++++++++++++-
 ovn/northd/ovn-northd.c     |  92 +++++++++++++++++++-
 tests/ovn.at                | 167 ++++++++++++++++++++++++++++++++++++
 3 files changed, 336 insertions(+), 6 deletions(-)

Patch

diff --git a/ovn/northd/ovn-northd.8.xml b/ovn/northd/ovn-northd.8.xml
index 392a5efc9..345023eb4 100644
--- a/ovn/northd/ovn-northd.8.xml
+++ b/ovn/northd/ovn-northd.8.xml
@@ -2002,7 +2002,86 @@  next;
       </li>
     </ul>
 
-    <h3>Ingress Table 9: Gateway Redirect</h3>
+    <h3>Ingress Table 9: Check packet length</h3>
+
+    <p>
+      For distributed logical routers with distributed gateway port configured
+      with <code>options:gateway_mtu</code> to a valid integer value, this
+      table adds a priority-50 logical flow with the match
+      <code>ip4 &amp;&amp; outport == <var>GW_PORT</var></code> where
+      <var>GW_PORT</var> is the distributed gateway router port and applies the
+      action <code>check_pkt_larger</code> and advances the packet to the
+      next table.
+    </p>
+
+    <pre>
+REGBIT_PKT_LARGER = check_pkt_larger(<var>L</var>); next;
+    </pre>
+
+    <p>
+      where <var>L</var> is the packet length to check for. If the packet
+      is larger than <var>L</var>, it stores 1 in the register bit
+      <code>REGBIT_PKT_LARGER</code>. The value of
+      <var>L</var> is taken from <ref column="options:gateway_mtu"
+      table="Logical_Router_Port" db="OVN_Northbound"/> column of
+      <ref table="Logical_Router_Port" db="OVN_Northbound"/> row.
+    </p>
+
+    <p>
+      This table adds one priority-0 fallback flow that matches all packets
+      and advances to the next table.
+    </p>
+
+    <h3>Ingress Table 10: Handle larger packets</h3>
+
+    <p>
+      For distributed logical routers with distributed gateway port configured
+      with <code>options:gateway_mtu</code> to a valid integer value, this
+      table adds the following priority-50 logical flow for each
+      logical router port with the match <code>ip4 &amp;&amp;
+      inport == <var>LRP</var> &amp;&amp; outport == <var>GW_PORT</var>
+      &amp;&amp; REGBIT_PKT_LARGER</code>, where <var>LRP</var> is the logical
+      router port and <var>GW_PORT</var> is the distributed gateway router port
+      and applies the following action
+    </p>
+
+    <pre>
+icmp4 {
+    icmp4.type = 3; /* Destination Unreachable. */
+    icmp4.code = 4;  /* Frag Needed and DF was Set. */
+    icmp4.frag_mtu = <var>M</var>;
+    eth.dst = <var>E</var>;
+    ip4.dst = ip4.src;
+    ip4.src = <var>I</var>;
+    ip.ttl = 255;
+    REGBIT_EGRESS_LOOPBACK = 1;
+    next(pipeline=ingress, table=0);
+};
+    </pre>
+
+    <ul>
+      <li>
+        Where <var>M</var> is the (fragment MTU - 58) whose value is taken from
+        <ref column="options:gateway_mtu" table="Logical_Router_Port"
+        db="OVN_Northbound"/> column of
+        <ref table="Logical_Router_Port" db="OVN_Northbound"/> row.
+      </li>
+
+      <li>
+        <var>E</var> is the Ethernet address of the logical router port.
+      </li>
+
+      <li>
+        <var>I</var> is the IPv4 address of the logical router port.
+      </li>
+    </ul>
+
+    <p>
+      This table adds one priority-0 fallback flow that matches all packets
+      and advances to the next table.
+    </p>
+
+    <h3>Ingress Table 11: Gateway Redirect</h3>
 
     <p>
       For distributed logical routers where one of the logical router
@@ -2059,7 +2138,7 @@  next;
       </li>
     </ul>
 
-    <h3>Ingress Table 10: ARP Request</h3>
+    <h3>Ingress Table 12: ARP Request</h3>
 
     <p>
       In the common case where the Ethernet destination has been resolved, this
diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
index 5614f9fa3..92e0e9c9d 100644
--- a/ovn/northd/ovn-northd.c
+++ b/ovn/northd/ovn-northd.c
@@ -142,8 +142,10 @@  enum ovn_stage {
     PIPELINE_STAGE(ROUTER, IN,  ND_RA_RESPONSE, 6, "lr_in_nd_ra_response") \
     PIPELINE_STAGE(ROUTER, IN,  IP_ROUTING,     7, "lr_in_ip_routing")   \
     PIPELINE_STAGE(ROUTER, IN,  ARP_RESOLVE,    8, "lr_in_arp_resolve")  \
-    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,    9, "lr_in_gw_redirect")  \
-    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,    10, "lr_in_arp_request")  \
+    PIPELINE_STAGE(ROUTER, IN,  CHK_PKT_LEN   , 9, "lr_in_chk_pkt_len")   \
+    PIPELINE_STAGE(ROUTER, IN,  LARGER_PKTS,    10,"lr_in_larger_pkts")   \
+    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,    11, "lr_in_gw_redirect")  \
+    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,    12, "lr_in_arp_request")  \
                                                                       \
     /* Logical router egress stages. */                               \
     PIPELINE_STAGE(ROUTER, OUT, UNDNAT,    0, "lr_out_undnat")        \
@@ -179,6 +181,8 @@  enum ovn_stage {
  * logical router dropping packets with source IP address equals
  * one of the logical router's own IP addresses. */
 #define REGBIT_EGRESS_LOOPBACK  "reg9[1]"
+/* Register to store the result of check_pkt_larger action. */
+#define REGBIT_PKT_LARGER        "reg9[2]"
 
 /* Returns an "enum ovn_stage" built from the arguments. */
 static enum ovn_stage
@@ -6595,7 +6599,87 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
                       "get_nd(outport, xxreg0); next;");
     }
 
-    /* Logical router ingress table 9: Gateway redirect.
+    /* Local router ingress table 9: Check packet length.
+     *
+     * Any IPv4 packet with outport set to the distributed gateway
+     * router port, check the packet length and store the result in the
+     * 'REGBIT_PKT_LARGER' register bit.
+     *
+     * Local router ingress table 10: Handle larger packets.
+     *
+     * Any IPv4 packet with outport set to the distributed gateway
+     * router port and the 'REGBIT_PKT_LARGER' register bit is set,
+     * generate ICMPv4 packet with type 3 (Destination Unreachable) and
+     * code 4 (Fragmentation needed).
+     * */
+    HMAP_FOR_EACH (od, key_node, datapaths) {
+        if (!od->nbr) {
+            continue;
+        }
+
+        /* Packets are allowed by default. */
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_CHK_PKT_LEN, 0, "1",
+                      "next;");
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_LARGER_PKTS, 0, "1",
+                      "next;");
+
+        if (od->l3dgw_port && od->l3redirect_port) {
+            int gw_mtu = 0;
+            if (od->l3dgw_port->nbrp) {
+                 gw_mtu = smap_get_int(&od->l3dgw_port->nbrp->options,
+                                       "gateway_mtu", 0);
+            }
+            /* Add the flows only if gateway_mtu is configured. */
+            if (gw_mtu <= 0) {
+                continue;
+            }
+
+            ds_clear(&match);
+            ds_put_format(&match, "outport == %s && ip4",
+                          od->l3dgw_port->json_key);
+
+            ds_clear(&actions);
+            ds_put_format(&actions,
+                          REGBIT_PKT_LARGER" = check_pkt_larger(%d);"
+                          " next;", gw_mtu);
+            ovn_lflow_add(lflows, od, S_ROUTER_IN_CHK_PKT_LEN, 50,
+                          ds_cstr(&match), ds_cstr(&actions));
+
+            for (size_t i = 0; i < od->nbr->n_ports; i++) {
+                struct ovn_port *rp = ovn_port_find(ports,
+                                                    od->nbr->ports[i]->name);
+                if (!rp || rp == od->l3dgw_port) {
+                    continue;
+                }
+                ds_clear(&match);
+                ds_put_format(&match, "inport == %s && outport == %s && ip4 "
+                              "&& "REGBIT_PKT_LARGER,
+                              rp->json_key, od->l3dgw_port->json_key);
+
+                ds_clear(&actions);
+                /* Set icmp4.frag_mtu to gw_mtu - 58. 58 is the Geneve tunnel
+                 * overhead. */
+                ds_put_format(&actions,
+                    "icmp4_error {"
+                    REGBIT_EGRESS_LOOPBACK" = 1; "
+                    "eth.dst = %s; "
+                    "ip4.dst = ip4.src; "
+                    "ip4.src = %s; "
+                    "ip.ttl = 255; "
+                    "icmp4.type = 3; /* Destination Unreachable. */ "
+                    "icmp4.code = 4; /* Frag Needed and DF was Set. */ "
+                    "icmp4.frag_mtu = %d; "
+                    "next(pipeline=ingress, table=0); };",
+                    rp->lrp_networks.ea_s,
+                    rp->lrp_networks.ipv4_addrs[0].addr_s,
+                    gw_mtu - 18);
+                ovn_lflow_add(lflows, od, S_ROUTER_IN_LARGER_PKTS, 50,
+                              ds_cstr(&match), ds_cstr(&actions));
+            }
+        }
+    }
+
+    /* Logical router ingress table 11: Gateway redirect.
      *
      * For traffic with outport equal to the l3dgw_port
      * on a distributed router, this table redirects a subset
@@ -6635,7 +6719,7 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
         ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 0, "1", "next;");
     }
 
-    /* Local router ingress table 10: ARP request.
+    /* Local router ingress table 12: ARP request.
      *
      * In the common case where the Ethernet destination has been resolved,
      * this table outputs the packet (priority 0).  Otherwise, it composes
diff --git a/tests/ovn.at b/tests/ovn.at
index 043697120..acb8c832d 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -11972,6 +11972,7 @@  as hv2 start_daemon ovn-controller
 
 OVN_CLEANUP([hv1],[hv2])
 
+
 AT_CLEANUP
 
 AT_SETUP([ovn -- ovn-nbctl duplicate addresses])
@@ -12030,6 +12031,172 @@  AT_CHECK([ovn-nbctl lsp-set-addresses sw2-p1 "00:00:00:00:00:04 aef0::1"])
 
 AT_CLEANUP
 
+AT_SETUP([ovn -- router - check packet length - icmp defrag])
+AT_KEYWORDS([check packet length])
+AT_SKIP_IF([test $HAVE_PYTHON = no])
+ovn_start
+
+ovn-nbctl ls-add sw0
+ovn-nbctl lsp-add sw0 sw0-port1
+ovn-nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:01 10.0.0.3"
+
+ovn-nbctl lr-add lr0
+ovn-nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01 10.0.0.1/24
+ovn-nbctl lsp-add sw0 sw0-lr0
+ovn-nbctl lsp-set-type sw0-lr0 router
+ovn-nbctl lsp-set-addresses sw0-lr0 router
+ovn-nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0
+
+ovn-nbctl ls-add public
+ovn-nbctl lrp-add lr0 lr0-public 00:00:20:20:12:13 172.168.0.100/24
+ovn-nbctl lsp-add public public-lr0
+ovn-nbctl lsp-set-type public-lr0 router
+ovn-nbctl lsp-set-addresses public-lr0 router
+ovn-nbctl lsp-set-options public-lr0 router-port=lr0-public
+
+# localnet port
+ovn-nbctl lsp-add public ln-public
+ovn-nbctl lsp-set-type ln-public localnet
+ovn-nbctl lsp-set-addresses ln-public unknown
+ovn-nbctl lsp-set-options ln-public network_name=phys
+
+ovn-nbctl lrp-set-gateway-chassis lr0-public hv1 20
+ovn-nbctl lr-nat-add lr0 snat 172.168.0.100 10.0.0.0/24
+
+net_add n1
+
+sim_add hv1
+as hv1
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys
+ovs-vsctl -- add-port br-int hv1-vif1 -- \
+    set interface hv1-vif1 external-ids:iface-id=sw0-port1 \
+    options:tx_pcap=hv1/vif1-tx.pcap \
+    options:rxq_pcap=hv1/vif1-rx.pcap \
+    ofport-request=1
+
+reset_pcap_file() {
+     local iface=$1
+     local pcap_file=$2
+     ovs-vsctl -- set Interface $iface options:tx_pcap=dummy-tx.pcap \
+ options:rxq_pcap=dummy-rx.pcap
+     rm -f ${pcap_file}*.pcap
+     ovs-vsctl -- set Interface $iface options:tx_pcap=${pcap_file}-tx.pcap \
+ options:rxq_pcap=${pcap_file}-rx.pcap
+}
+
+ip_to_hex() {
+     printf "%02x%02x%02x%02x" "$@"
+}
+
+test_ip_packet_larger() {
+    local icmp_pmtu_reply_expected=$1
+
+    # Send ip packet from sw0-port1 to outside
+    src_mac="505400000001" # sw-port1 mac
+    dst_mac="00000000ff01" # sw0-lr0 mac (internal router leg)
+    src_ip=`ip_to_hex 10 0 0 3`
+    dst_ip=`ip_to_hex 172 168 0 3`
+    # Set the packet length to 100.
+    pkt_len=0064
+    packet=${dst_mac}${src_mac}08004500${pkt_len}0000000040010000
+    orig_packet_l3=${src_ip}${dst_ip}0304000000000000
+    orig_packet_l3=${orig_packet_l3}000000000000000000000000000000000000
+    orig_packet_l3=${orig_packet_l3}000000000000000000000000000000000000
+    orig_packet_l3=${orig_packet_l3}000000000000000000000000000000000000
+    orig_packet_l3=${orig_packet_l3}000000000000000000000000000000000000
+    packet=${packet}${orig_packet_l3}
+
+    # If icmp_pmtu_reply_expected is 0, it means the packet is lesser than
+    # the gateway mtu and should be delivered to the provider bridge via the
+    # localnet port.
+    # If icmp_pmtu_reply_expected is 1, it means the packet is larger than
+    # the gateway mtu and ovn-controller should drop the packet and instead
+    # generate ICMPv4  Destination Unreachable message with pmtu set to 42.
+    if test $icmp_pmtu_reply_expected = 0; then
+        # Packet to expect at br-phys.
+        src_mac="000020201213"
+        dst_mac="00000012af11"
+        src_ip=`ip_to_hex 10 0 0 3`
+        dst_ip=`ip_to_hex 172 168 0 3`
+        expected=${dst_mac}${src_mac}08004500${pkt_len}000000003f010100
+        expected=${expected}${src_ip}${dst_ip}0304000000000000
+        expected=${expected}000000000000000000000000000000000000
+        expected=${expected}000000000000000000000000000000000000
+        expected=${expected}000000000000000000000000000000000000
+        expected=${expected}000000000000000000000000000000000000
+        echo $expected > br_phys_n1.expected
+    else
+        # MTU would be 100 - 18 = 82 (hex 0052)
+        mtu=0052
+        src_ip=`ip_to_hex 10 0 0 1`
+        dst_ip=`ip_to_hex 10 0 0 3`
+        # pkt len should be 128 (28 (icmp packet) + 100 (orig ip + payload))
+        reply_pkt_len=0080
+        ip_csum=bd91
+        icmp_reply=${src_mac}${dst_mac}08004500${reply_pkt_len}00004000fe016879
+        icmp_reply=${icmp_reply}${src_ip}${dst_ip}0304${ip_csum}0000${mtu}
+        icmp_reply=${icmp_reply}4500${pkt_len}000000003f010100
+        icmp_reply=${icmp_reply}${orig_packet_l3}
+        echo $icmp_reply > hv1-vif1.expected
+    fi
+
+    as hv1 reset_pcap_file br-phys_n1 hv1/br-phys_n1
+    as hv1 reset_pcap_file hv1-vif1 hv1/vif1
+
+    # Send packet from sw0-port1 to outside
+    as hv1 ovs-appctl netdev-dummy/receive hv1-vif1 $packet
+
+    if test $icmp_pmtu_reply_expected = 0; then
+        OVN_CHECK_PACKETS([hv1/br-phys_n1-tx.pcap], [br_phys_n1.expected])
+        $PYTHON "$top_srcdir/utilities/ovs-pcap.in" hv1/vif1-tx.pcap  > packets
+        AT_CHECK([cat packets], [0], [])
+    else
+        OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [hv1-vif1.expected])
+        $PYTHON "$top_srcdir/utilities/ovs-pcap.in" hv1/br-phys_n1-tx.pcap  > \
+        packets
+        AT_CHECK([cat packets], [0], [])
+    fi
+}
+
+ovn-nbctl show
+ovn-sbctl show
+
+AT_CHECK([as hv1 ovs-ofctl dump-flows br-int  \
+| grep "check_pkt_larger" | wc -l], [0], [[0
+]])
+dp_uuid=$(ovn-sbctl find datapath_binding | grep sw0 -B2 | grep _uuid | \
+awk '{print $3}')
+ovn-sbctl create MAC_Binding ip=172.168.0.3 datapath=$dp_uuid \
+logical_port=lr0-public mac="00\:00\:00\:12\:af\:11"
+
+# Set the gateway mtu to 100. If the packet length is > 100, ovn-controller
+# should send icmp host not reachable with pmtu set to 100.
+ovn-nbctl --wait=hv set logical_router_port lr0-public options:gateway_mtu=100
+as hv3 ovs-appctl netdev-dummy/receive hv3-vif1 $arp_reply
+OVS_WAIT_UNTIL([
+    test `as hv1 ovs-ofctl dump-flows br-int | grep "check_pkt_larger(100)" | \
+    wc -l` -eq 1
+])
+
+icmp_reply_expected=1
+test_ip_packet_larger $icmp_reply_expected
+
+# Set the gateway mtu to 500.
+ovn-nbctl --wait=hv set logical_router_port lr0-public options:gateway_mtu=500
+as hv3 ovs-appctl netdev-dummy/receive hv3-vif1 $arp_reply
+OVS_WAIT_UNTIL([
+    test `as hv1 ovs-ofctl dump-flows br-int | grep "check_pkt_larger(500)" | \
+    wc -l` -eq 1
+])
+
+# Now the packet should be sent via the localnet port to br-phys.
+icmp_reply_expected=0
+test_ip_packet_larger $icmp_reply_expected
+OVN_CLEANUP([hv1])
+AT_CLEANUP
+
 AT_SETUP([ovn -- IP packet buffering])
 AT_KEYWORDS([ip-buffering])
 AT_SKIP_IF([test $HAVE_PYTHON = no])