diff mbox series

[ovs-dev,RFC] userspace: Enable tunnel with TSO.

Message ID 1612375975-59157-1-git-send-email-u9012063@gmail.com
State New
Headers show
Series [ovs-dev,RFC] userspace: Enable tunnel with TSO. | expand

Commit Message

William Tu Feb. 3, 2021, 6:12 p.m. UTC
Currently, when 'userspace-tso-enable=true' is set, the tunnel test
cases fail due to incorrect checksums in both the inner and the outer
headers.  This patch recalculates the checksum before a packet is output
to a port (tunnel or tap), so that the receiver sees a correct checksum.

Consider the following cases:
1) veth -> ovs -> veth, and 2) tap -> ovs -> tap
No need to recalc csum because vnet hdr carries the offload
information.

3) decap: vxlan tunnel -> br-underlay -> br-overlay
The inner packet is sent to br-overlay (which is a tap).
Need to fix the inner header's csum.

4) encap: br-overlay -> br-underlay -> vxlan tunnel
Fix the inner csum before pushing the outer header.

I added iperf tests, and the vxlan and geneve tests now pass:
$ make check-system-tso TESTSUITEFLAGS="-k vxlan"
$ make check-system-tso TESTSUITEFLAGS="-k geneve"

While TCP works over the tunnel, a TCP sender that sends very large
packets will fail; I have to segment the inner TCP packet before
pushing the outer tunnel header.

Signed-off-by: William Tu <u9012063@gmail.com>
---
 lib/netdev-linux.c      |  2 +-
 lib/netdev-native-tnl.c | 11 ++++++++++-
 lib/netdev.c            | 18 ++++++------------
 lib/packets.c           | 34 ++++++++++++++++++++++++++++++++++
 lib/packets.h           |  1 +
 tests/system-tap.at     |  3 +++
 tests/system-traffic.at |  9 +++++++++
 7 files changed, 64 insertions(+), 14 deletions(-)
diff mbox series

Patch

diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 6be23dbeed57..bb365b3b0da3 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -1446,7 +1446,6 @@  netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu,
                          netdev_get_name(netdev_));
             continue;
         }
-
         dp_packet_batch_add(batch, pkt);
     }
 
@@ -1604,6 +1603,7 @@  netdev_linux_tap_batch_send(struct netdev *netdev_, bool tso, int mtu,
         int error;
 
         if (tso) {
+            packet_csum_tcpudp(packet);
             netdev_linux_prepend_vnet_hdr(packet, mtu);
         }
 
diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
index b89dfdd52a86..003c78a151f8 100644
--- a/lib/netdev-native-tnl.c
+++ b/lib/netdev-native-tnl.c
@@ -43,6 +43,7 @@ 
 #include "seq.h"
 #include "unaligned.h"
 #include "unixctl.h"
+#include "userspace-tso.h"
 #include "openvswitch/vlog.h"
 
 VLOG_DEFINE_THIS_MODULE(native_tnl);
@@ -153,6 +154,12 @@  netdev_tnl_push_ip_header(struct dp_packet *packet,
     struct ip_header *ip;
     struct ovs_16aligned_ip6_hdr *ip6;
 
+    if (userspace_tso_enabled()) {
+        /* Calculate inner header's checksum before pushing outer header.
+         * (Assume the device does not support tnl checksum) */
+        packet_csum_tcpudp(packet);
+    }
+
     eth = dp_packet_push_uninit(packet, size);
     *ip_tot_size = dp_packet_size(packet) - sizeof (struct eth_header);
 
@@ -189,7 +196,9 @@  udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
         return NULL;
     }
 
-    if (udp->udp_csum) {
+    /* 'udp->udp_csum' will be the pseudo header csum when userspace
+     * TSO is enabled. Skip the validation. */
+    if (udp->udp_csum && !userspace_tso_enabled()) {
         if (OVS_UNLIKELY(!dp_packet_l4_checksum_valid(packet))) {
             uint32_t csum;
             if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) {
diff --git a/lib/netdev.c b/lib/netdev.c
index 91e91955c09b..bdf0000c45e9 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -960,18 +960,12 @@  netdev_push_header(const struct netdev *netdev,
     size_t i, size = dp_packet_batch_size(batch);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
-                         || dp_packet_hwol_l4_mask(packet))) {
-            COVERAGE_INC(netdev_push_header_drops);
-            dp_packet_delete(packet);
-            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
-                         "not supported: packet dropped",
-                         netdev_get_name(netdev));
-        } else {
-            netdev->netdev_class->push_header(netdev, packet, data);
-            pkt_metadata_init(&packet->md, data->out_port);
-            dp_packet_batch_refill(batch, packet, i);
-        }
+        /* Tunneling packet with HW offload flags is not supported. */
+        *dp_packet_ol_flags_ptr(packet) = 0;
+
+        netdev->netdev_class->push_header(netdev, packet, data);
+        pkt_metadata_init(&packet->md, data->out_port);
+        dp_packet_batch_refill(batch, packet, i);
     }
 
     return 0;
diff --git a/lib/packets.c b/lib/packets.c
index 4a7643c5dd3a..b0bb283acdfa 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -1887,3 +1887,37 @@  IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6)
         }
     }
 }
+
+/* Recomputes in software the TCP or UDP checksum of untagged IPv4 packet 'p'.
+ * Other packets, IP fragments (which cannot be summed on their own), and
+ * packets with a missing or truncated L4 header are left untouched. */
+void
+packet_csum_tcpudp(struct dp_packet *p)
+{
+    const struct eth_header *eth = dp_packet_eth(p);
+    const struct ip_header *ip = dp_packet_l3(p);
+    size_t l4_size = dp_packet_l4_size(p);
+
+    if (!eth || !ip || eth->eth_type != htons(ETH_TYPE_IP)
+        || IP_IS_FRAGMENT(ip->ip_frag_off)) {
+        return;
+    }
+
+    if (ip->ip_proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) {
+        struct tcp_header *tcp = dp_packet_l4(p);
+
+        tcp->tcp_csum = 0;
+        tcp->tcp_csum = csum_finish(
+            csum_continue(packet_csum_pseudoheader(ip), tcp, l4_size));
+    } else if (ip->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) {
+        struct udp_header *udp = dp_packet_l4(p);
+
+        udp->udp_csum = 0;
+        udp->udp_csum = csum_finish(
+            csum_continue(packet_csum_pseudoheader(ip), udp, l4_size));
+        if (!udp->udp_csum) {
+            /* RFC 768: an all-zero result is transmitted as all ones. */
+            udp->udp_csum = htons(0xffff);
+        }
+    }
+}
diff --git a/lib/packets.h b/lib/packets.h
index 481bc22fa1fe..1bea8c504811 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -1634,6 +1634,7 @@  void packet_put_ra_prefix_opt(struct dp_packet *,
                               ovs_be32 preferred_lifetime,
                               const ovs_be128 router_prefix);
 uint32_t packet_csum_pseudoheader(const struct ip_header *);
+void packet_csum_tcpudp(struct dp_packet *p);
 void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
 
 #define DNS_HEADER_LEN 12
diff --git a/tests/system-tap.at b/tests/system-tap.at
index 871a3bda4fcc..be108c59b3c9 100644
--- a/tests/system-tap.at
+++ b/tests/system-tap.at
@@ -29,6 +29,9 @@  NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0],
 OVS_START_L7([at_ns1], [http])
 NS_CHECK_EXEC([at_ns0], [wget 10.1.1.2 -t 3 -T 1 --retry-connrefused -v -o wget0.log])
 
+NETNS_DAEMONIZE([at_ns0], [iperf -s], [iperf.pid])
+NS_CHECK_EXEC([at_ns1], [iperf -c 10.1.1.1 -t1 1> /dev/null], [0])
+
 OVS_TRAFFIC_VSWITCHD_STOP(["/.*ethtool command ETHTOOL_G.*/d"])
 
 AT_CLEANUP
diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index fb5b9a36d283..ed014953ca4e 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -248,6 +248,7 @@  dnl Okay, now check the overlay with different packet sizes
 NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl
 3 packets transmitted, 3 received, 0% packet loss, time 0ms
 ])
+
 NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl
 3 packets transmitted, 3 received, 0% packet loss, time 0ms
 ])
@@ -255,6 +256,10 @@  NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PI
 3 packets transmitted, 3 received, 0% packet loss, time 0ms
 ])
 
+NETNS_DAEMONIZE([at_ns0], [iperf -s], [iperf.pid])
+AT_CHECK([ethtool -K br0 tso off &> /dev/null], [0])
+AT_CHECK([iperf -c 10.1.1.1 -t1 1> /dev/null], [0])
+
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
@@ -571,6 +576,10 @@  NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PI
 3 packets transmitted, 3 received, 0% packet loss, time 0ms
 ])
 
+NETNS_DAEMONIZE([at_ns0], [iperf -s], [iperf.pid])
+AT_CHECK([ethtool -K br0 tso off &> /dev/null], [0])
+AT_CHECK([iperf -c 10.1.1.1 -t1 1> /dev/null], [0])
+
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP