diff mbox series

[ovs-dev,RFC,v2,5/5] ovn: Generate ICMPv4 packet in router pipeline for larger packets

Message ID 20190103181916.17678-1-nusiddiq@redhat.com
State Superseded
Headers show
Series Address MTU issue for larger packets in OVN | expand

Commit Message

Numan Siddique Jan. 3, 2019, 6:19 p.m. UTC
From: Numan Siddique <nusiddiq@redhat.com>

Signed-off-by: Numan Siddique <nusiddiq@redhat.com>
---
 ovn/northd/ovn-northd.8.xml |  83 +++++++++++++++++-
 ovn/northd/ovn-northd.c     |  89 ++++++++++++++++++-
 tests/ovn.at                | 165 ++++++++++++++++++++++++++++++++++++
 3 files changed, 331 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/ovn/northd/ovn-northd.8.xml b/ovn/northd/ovn-northd.8.xml
index 392a5efc9..a3922e8e1 100644
--- a/ovn/northd/ovn-northd.8.xml
+++ b/ovn/northd/ovn-northd.8.xml
@@ -2002,7 +2002,86 @@  next;
       </li>
     </ul>
 
-    <h3>Ingress Table 9: Gateway Redirect</h3>
+    <h3>Ingress Table 9: Check packet length</h3>
+
+    <p>
+      For distributed logical routers with distributed gateway port configured
+      with <code>options:gateway_mtu</code> to a valid integer value, this
+      table adds a priority-50 logical flow with the match
+      <code>ip4 &amp;&amp; outport == <var>GW_PORT</var></code> where
+      <var>GW_PORT</var> is the distributed gateway router port and applies the
+      action <code>check_pkt_larger</code> and advances the packet to the
+      next table.
+    </p>
+
+    <pre>
+REGBIT_PKT_LARGER = check_pkt_larger(<var>L</var>); next;
+    </pre>
+
+    <p>
+      where <var>L</var> is the packet length to check for. If the packet
+      is larger than <var>L</var>, it stores 1 in the register bit
+      <code>REGBIT_PKT_LARGER</code>. The value of
+      <var>L</var> is taken from <ref column="options:gateway_mtu"
+      table="Logical_Router_Port" db="OVN_Northbound"/> column of
+      <ref table="Logical_Router_Port" db="OVN_Northbound"/> row.
+    </p>
+
+    <p>
+      This table adds one priority-0 fallback flow that matches all packets
+      and advances to the next table.
+    </p>
+
+    <h3>Ingress Table 10: Handle larger packets</h3>
+
+    <p>
+      For distributed logical routers with distributed gateway port configured
+      with <code>options:gateway_mtu</code> to a valid integer value, this
+      table adds the following priority-50 logical flow for each
+      logical router port with the match <code>ip4 &amp;&amp;
+      inport == <var>LRP</var> &amp;&amp; outport == <var>GW_PORT</var>
+      &amp;&amp; REGBIT_PKT_LARGER</code>, where <var>LRP</var> is the logical
+      router port and <var>GW_PORT</var> is the distributed gateway router port
+      and applies the following action
+    </p>
+
+    <pre>
+icmp4 {
+    icmp4.type = 3; /* Destination Unreachable. */
+    icmp4.code = 4;  /* Frag Needed and DF was Set. */
+    icmp4.frag_mtu = <var>M</var>;
+    eth.dst = <var>E</var>;
+    ip4.dst = ip4.src;
+    ip4.src = <var>I</var>;
+    ip.ttl = 255;
+    REGBIT_EGRESS_LOOPBACK = 1;
+    next(pipeline=ingress, table=0);
+};
+    </pre>
+
+    <ul>
+      <li>
+        Where <var>M</var> is the fragment MTU whose value is taken from
+        <ref column="options:gateway_mtu" table="Logical_Router_Port"
+        db="OVN_Northbound"/> column of
+        <ref table="Logical_Router_Port" db="OVN_Northbound"/> row.
+      </li>
+
+      <li>
+        <var>E</var> is the Ethernet address of the logical router port.
+      </li>
+
+      <li>
+        <var>I</var> is the IPv4 address of the logical router port.
+      </li>
+    </ul>
+
+    <p>
+      This table adds one priority-0 fallback flow that matches all packets
+      and advances to the next table.
+    </p>
+
+    <h3>Ingress Table 11: Gateway Redirect</h3>
 
     <p>
       For distributed logical routers where one of the logical router
@@ -2059,7 +2138,7 @@  next;
       </li>
     </ul>
 
-    <h3>Ingress Table 10: ARP Request</h3>
+    <h3>Ingress Table 12: ARP Request</h3>
 
     <p>
       In the common case where the Ethernet destination has been resolved, this
diff --git a/ovn/northd/ovn-northd.c b/ovn/northd/ovn-northd.c
index ebf829278..9f0621c41 100644
--- a/ovn/northd/ovn-northd.c
+++ b/ovn/northd/ovn-northd.c
@@ -143,8 +143,10 @@  enum ovn_stage {
     PIPELINE_STAGE(ROUTER, IN,  ND_RA_RESPONSE, 6, "lr_in_nd_ra_response") \
     PIPELINE_STAGE(ROUTER, IN,  IP_ROUTING,     7, "lr_in_ip_routing")   \
     PIPELINE_STAGE(ROUTER, IN,  ARP_RESOLVE,    8, "lr_in_arp_resolve")  \
-    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,    9, "lr_in_gw_redirect")  \
-    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,    10, "lr_in_arp_request")  \
+    PIPELINE_STAGE(ROUTER, IN,  CHK_PKT_LEN   , 9, "lr_in_chk_pkt_len")   \
+    PIPELINE_STAGE(ROUTER, IN,  LARGER_PKTS,    10,"lr_in_larger_pkts")   \
+    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,    11, "lr_in_gw_redirect")  \
+    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,    12, "lr_in_arp_request")  \
                                                                       \
     /* Logical router egress stages. */                               \
     PIPELINE_STAGE(ROUTER, OUT, UNDNAT,    0, "lr_out_undnat")        \
@@ -180,6 +182,8 @@  enum ovn_stage {
  * logical router dropping packets with source IP address equals
  * one of the logical router's own IP addresses. */
 #define REGBIT_EGRESS_LOOPBACK  "reg9[1]"
+/* Register to store the result of check_pkt_larger action. */
+#define REGBIT_PKT_LARGER        "reg9[2]"
 
 /* Returns an "enum ovn_stage" built from the arguments. */
 static enum ovn_stage
@@ -6574,7 +6578,84 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
                       "get_nd(outport, xxreg0); next;");
     }
 
-    /* Logical router ingress table 9: Gateway redirect.
+    /* Local router ingress table 9: Check packet length.
+     *
+     * Any IPv4 packet with outport set to the distributed gateway
+     * router port, check the packet length and store the result in the
+     * 'REGBIT_PKT_LARGER' register bit.
+     *
+     * Local router ingress table 10: Handle larger packets.
+     *
+     * Any IPv4 packet with outport set to the distributed gateway
+     * router port and the 'REGBIT_PKT_LARGER' register bit is set,
+     * generate ICMPv4 packet with type 3 (Destination Unreachable) and
+     * code 4 (Fragmentation needed).
+     * */
+    HMAP_FOR_EACH (od, key_node, datapaths) {
+        if (!od->nbr) {
+            continue;
+        }
+
+        /* Packets are allowed by default. */
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_CHK_PKT_LEN, 0, "1",
+                      "next;");
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_LARGER_PKTS, 0, "1",
+                      "next;");
+
+        if (od->l3dgw_port && od->l3redirect_port) {
+            int gw_mtu = 0;
+            if (od->l3dgw_port->nbrp) {
+                 gw_mtu = smap_get_int(&od->l3dgw_port->nbrp->options,
+                                       "gateway_mtu", 0);
+            }
+            /* Add the flows only if gateway_mtu is configured. */
+            if (gw_mtu <= 0) {
+                continue;
+            }
+
+            ds_clear(&match);
+            ds_put_format(&match, "outport == %s && ip4",
+                          od->l3dgw_port->json_key);
+
+            ds_clear(&actions);
+            ds_put_format(&actions,
+                          REGBIT_PKT_LARGER" = check_pkt_larger(%d);"
+                          " next;", gw_mtu);
+            ovn_lflow_add(lflows, od, S_ROUTER_IN_CHK_PKT_LEN, 50,
+                          ds_cstr(&match), ds_cstr(&actions));
+
+            for (size_t i = 0; i < od->nbr->n_ports; i++) {
+                struct ovn_port *rp = ovn_port_find(ports,
+                                                    od->nbr->ports[i]->name);
+                if (!rp || rp == od->l3dgw_port) {
+                    continue;
+                }
+                ds_clear(&match);
+                ds_put_format(&match, "inport == %s && outport == %s && ip4 "
+                              "&& "REGBIT_PKT_LARGER,
+                              rp->json_key, od->l3dgw_port->json_key);
+
+                ds_clear(&actions);
+                ds_put_format(&actions,
+                    "icmp4 {"
+                    "icmp4.type = 3; /* Destination Unreachable. */ "
+                    "icmp4.code = 4; /* Frag Needed and DF was Set. */ "
+                    "icmp4.frag_mtu = %d; "
+                    "eth.dst = %s; "
+                    "ip4.dst = ip4.src; "
+                    "ip4.src = %s; "
+                    "ip.ttl = 255; "
+                    REGBIT_EGRESS_LOOPBACK" = 1; "
+                    "next(pipeline=ingress, table=0); };",
+                    gw_mtu - 58, rp->lrp_networks.ea_s,
+                    rp->lrp_networks.ipv4_addrs[0].addr_s);
+                ovn_lflow_add(lflows, od, S_ROUTER_IN_LARGER_PKTS, 50,
+                              ds_cstr(&match), ds_cstr(&actions));
+            }
+        }
+    }
+
+    /* Logical router ingress table 11: Gateway redirect.
      *
      * For traffic with outport equal to the l3dgw_port
      * on a distributed router, this table redirects a subset
@@ -6614,7 +6695,7 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
         ovn_lflow_add(lflows, od, S_ROUTER_IN_GW_REDIRECT, 0, "1", "next;");
     }
 
-    /* Local router ingress table 10: ARP request.
+    /* Local router ingress table 12: ARP request.
      *
      * In the common case where the Ethernet destination has been resolved,
      * this table outputs the packet (priority 0).  Otherwise, it composes
diff --git a/tests/ovn.at b/tests/ovn.at
index 442b3a917..117b3ac3d 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -11707,6 +11707,7 @@  as hv2 start_daemon ovn-controller
 
 OVN_CLEANUP([hv1],[hv2])
 
+
 AT_CLEANUP
 
 AT_SETUP([ovn -- ovn-nbctl duplicate addresses])
@@ -11765,6 +11766,170 @@  AT_CHECK([ovn-nbctl lsp-set-addresses sw2-p1 "00:00:00:00:00:04 aef0::1"])
 
 AT_CLEANUP
 
+AT_SETUP([ovn -- router - check packet length - icmp defrag])
+AT_KEYWORDS([check packet length])
+AT_SKIP_IF([test $HAVE_PYTHON = no])
+ovn_start
+
+ovn-nbctl ls-add sw0
+ovn-nbctl lsp-add sw0 sw0-port1
+ovn-nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:01 10.0.0.3"
+
+ovn-nbctl lr-add lr0
+ovn-nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01 10.0.0.1/24
+ovn-nbctl lsp-add sw0 sw0-lr0
+ovn-nbctl lsp-set-type sw0-lr0 router
+ovn-nbctl lsp-set-addresses sw0-lr0 router
+ovn-nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0
+
+ovn-nbctl ls-add public
+ovn-nbctl lrp-add lr0 lr0-public 00:00:20:20:12:13 172.168.0.100/24
+ovn-nbctl lsp-add public public-lr0
+ovn-nbctl lsp-set-type public-lr0 router
+ovn-nbctl lsp-set-addresses public-lr0 router
+ovn-nbctl lsp-set-options public-lr0 router-port=lr0-public
+
+# localnet port
+ovn-nbctl lsp-add public ln-public
+ovn-nbctl lsp-set-type ln-public localnet
+ovn-nbctl lsp-set-addresses ln-public unknown
+ovn-nbctl lsp-set-options ln-public network_name=phys
+
+ovn-nbctl lrp-set-gateway-chassis lr0-public hv1 20
+ovn-nbctl lr-nat-add lr0 snat 172.168.0.100 10.0.0.0/24
+
+net_add n1
+
+sim_add hv1
+as hv1
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys
+ovs-vsctl -- add-port br-int hv1-vif1 -- \
+    set interface hv1-vif1 external-ids:iface-id=sw0-port1 \
+    options:tx_pcap=hv1/vif1-tx.pcap \
+    options:rxq_pcap=hv1/vif1-rx.pcap \
+    ofport-request=1
+
+reset_pcap_file() {
+     local iface=$1
+     local pcap_file=$2
+     ovs-vsctl -- set Interface $iface options:tx_pcap=dummy-tx.pcap \
+ options:rxq_pcap=dummy-rx.pcap
+     rm -f ${pcap_file}*.pcap
+     ovs-vsctl -- set Interface $iface options:tx_pcap=${pcap_file}-tx.pcap \
+ options:rxq_pcap=${pcap_file}-rx.pcap
+}
+
+ip_to_hex() {
+     printf "%02x%02x%02x%02x" "$@"
+}
+
+test_ip_packet_larger() {
+    local icmp_pmtu_reply_expected=$1
+
+    # Send ip packet from sw0-port1 to outside
+    src_mac="505400000001" # sw-port1 mac
+    dst_mac="00000000ff01" # sw0-lr0 mac (internal router leg)
+    src_ip=`ip_to_hex 10 0 0 3`
+    dst_ip=`ip_to_hex 172 168 0 3`
+    # Set the packet length to 100.
+    pkt_len=0064
+    packet=${dst_mac}${src_mac}08004500${pkt_len}0000000040010000
+    packet=${packet}${src_ip}${dst_ip}0304000000000000
+    packet=${packet}000000000000000000000000000000000000
+    packet=${packet}000000000000000000000000000000000000
+    packet=${packet}000000000000000000000000000000000000
+    packet=${packet}000000000000000000000000000000000000
+
+    # If icmp_pmtu_reply_expected is 0, it means the packet is lesser than
+    # the gateway mtu and should be delivered to the provider bridge via the
+    # localnet port.
+    # If icmp_pmtu_reply_expected is 1, it means the packet is larger than
+    # the gateway mtu and ovn-controller should drop the packet and instead
+    # generate ICMPv4  Destination Unreachable message with pmtu set to 100.
+    if test $icmp_pmtu_reply_expected = 0; then
+        # Packet to expect at br-phys.
+        src_mac="000020201213"
+        dst_mac="00000012af11"
+        src_ip=`ip_to_hex 10 0 0 3`
+        dst_ip=`ip_to_hex 172 168 0 3`
+        expected=${dst_mac}${src_mac}08004500${pkt_len}000000003f010100
+        expected=${expected}${src_ip}${dst_ip}0304000000000000
+        expected=${expected}000000000000000000000000000000000000
+        expected=${expected}000000000000000000000000000000000000
+        expected=${expected}000000000000000000000000000000000000
+        expected=${expected}000000000000000000000000000000000000
+        echo $expected > br_phys_n1.expected
+    else
+        mtu=$pkt_len
+        src_ip=`ip_to_hex 10 0 0 1`
+        dst_ip=`ip_to_hex 10 0 0 3`
+        orig_pkt_src_ip=`ip_to_hex 10 0 0 3`
+        orig_pkt_dst_ip=`ip_to_hex 172 168 0 3`
+        icmp_reply=${src_mac}${dst_mac}08004500003800004000fe0168c1
+        icmp_reply=${icmp_reply}${src_ip}${dst_ip}0304bd7f0000${mtu}
+        icmp_reply=${icmp_reply}4500${pkt_len}000000003f010100
+        icmp_reply=${icmp_reply}${orig_pkt_src_ip}${orig_pkt_dst_ip}
+        icmp_reply=${icmp_reply}0304000000000000
+        echo $icmp_reply > hv1-vif1.expected
+    fi
+
+    as hv1 reset_pcap_file br-phys_n1 hv1/br-phys_n1
+    as hv1 reset_pcap_file hv1-vif1 hv1/vif1
+
+    # Send packet from sw0-port1 to outside
+    as hv1 ovs-appctl netdev-dummy/receive hv1-vif1 $packet
+
+    if test $icmp_pmtu_reply_expected = 0; then
+        OVN_CHECK_PACKETS([hv1/br-phys_n1-tx.pcap], [br_phys_n1.expected])
+        $PYTHON "$top_srcdir/utilities/ovs-pcap.in" hv1/vif1-tx.pcap  > packets
+        AT_CHECK([cat packets], [0], [])
+    else
+        OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [hv1-vif1.expected])
+        $PYTHON "$top_srcdir/utilities/ovs-pcap.in" hv1/br-phys_n1-tx.pcap  > \
+        packets
+        AT_CHECK([cat packets], [0], [])
+    fi
+}
+
+ovn-nbctl show
+ovn-sbctl show
+
+AT_CHECK([as hv1 ovs-ofctl dump-flows br-int  \
+| grep "check_pkt_larger" | wc -l], [0], [[0
+]])
+dp_uuid=$(ovn-sbctl find datapath_binding | grep sw0 -B2 | grep _uuid | \
+awk '{print $3}')
+ovn-sbctl create MAC_Binding ip=172.168.0.3 datapath=$dp_uuid \
+logical_port=lr0-public mac="00\:00\:00\:12\:af\:11"
+
+# Set the gateway mtu to 100. If the packet length is > 100, ovn-controller
+# should send icmp host not reachable with pmtu set to 100.
+ovn-nbctl --wait=hv set logical_router_port lr0-public options:gateway_mtu=100
+as hv3 ovs-appctl netdev-dummy/receive hv3-vif1 $arp_reply
+OVS_WAIT_UNTIL([
+    test `as hv1 ovs-ofctl dump-flows br-int | grep "check_pkt_larger(100)" | \
+    wc -l` -eq 1
+])
+
+icmp_reply_expected=1
+test_ip_packet_larger $icmp_reply_expected
+
+# Set the gateway mtu to 500.
+ovn-nbctl --wait=hv set logical_router_port lr0-public options:gateway_mtu=500
+as hv3 ovs-appctl netdev-dummy/receive hv3-vif1 $arp_reply
+OVS_WAIT_UNTIL([
+    test `as hv1 ovs-ofctl dump-flows br-int | grep "check_pkt_larger(500)" | \
+    wc -l` -eq 1
+])
+
+# Now the packet should be sent via the localnet port to br-phys.
+icmp_reply_expected=0
+test_ip_packet_larger $icmp_reply_expected
+OVN_CLEANUP([hv1])
+AT_CLEANUP
+
 AT_SETUP([ovn -- IP packet buffering])
 AT_KEYWORDS([ip-buffering])
 AT_SKIP_IF([test $HAVE_PYTHON = no])