diff mbox series

[ovs-dev,RFC] dp-packet-gso: Add GSO support.

Message ID 1612921064-45294-1-git-send-email-u9012063@gmail.com
State RFC
Headers show
Series [ovs-dev,RFC] dp-packet-gso: Add GSO support. | expand

Commit Message

William Tu Feb. 10, 2021, 1:37 a.m. UTC
This patch adds GSO support for IPv4 TCP when userspace-tso is enabled.
Tested by having a veth send a TSO packet to OVS, which segments it into
smaller TCP segments and forwards them to a netdev-afxdp port in another
namespace.

Future work includes:
1. GSO for UDP, and IPv6 TCP/UDP GSO.
2. Tunnel GSO: VXLAN GSO, Geneve GSO, GRE GSO...

Tested using
$ make check-afxdp TESTSUITEFLAGS='3'

Or script below:
  ovs-vsctl set Open_vSwitch . other_config:userspace-tso-enable=true
  ovs-vsctl -- add-br br0 -- set Bridge br0 datapath_type=netdev
  ip netns add at_ns0
  ip link add p0 type veth peer name afxdp-p0
  ip link set p0 netns at_ns0
  ip link set dev afxdp-p0 up
  ovs-vsctl add-port br0 afxdp-p0
  ip netns exec at_ns0 sh << NS_EXEC_HEREDOC
  ip addr add "10.1.1.1/24" dev p0
  ip link set dev p0 up
  NS_EXEC_HEREDOC

  ip netns add at_ns1
  ip link add p1 type veth peer name afxdp-p1
  ip link set p1 netns at_ns1
  ip link set dev afxdp-p1 up
  ovs-vsctl add-port br0 afxdp-p1 -- set int afxdp-p1 type=afxdp

  ip netns exec at_ns1 sh << NS_EXEC_HEREDOC
  ip addr add "10.1.1.2/24" dev p1
  ip link set dev p1 up
  NS_EXEC_HEREDOC

  ip netns exec at_ns0 ping -c 3 -i .2 10.1.1.2
  ip netns exec at_ns1 ethtool -K p1 tx off
  ip netns exec at_ns1 iperf -s
  ip netns exec at_ns0 iperf -c 10.1.1.2 -t1

Tested-at: https://github.com/williamtu/ovs-travis/actions/runs/553156643
Signed-off-by: William Tu <u9012063@gmail.com>
---
 lib/automake.mk       |   2 +
 lib/dp-packet-gso.c   | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/dp-packet-gso.h   |  27 +++++++++
 lib/netdev-afxdp.c    |   6 ++
 lib/netdev.c          |  88 +++++++++++++++++++++++------
 lib/packets.c         |  35 ++++++++++++
 lib/packets.h         |   1 +
 tests/system-afxdp.at |  32 +++++++++++
 8 files changed, 324 insertions(+), 16 deletions(-)
 create mode 100644 lib/dp-packet-gso.c
 create mode 100644 lib/dp-packet-gso.h
diff mbox series

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index 39afbff9d1a0..57f504d52f5c 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -104,6 +104,8 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/dpctl.h \
 	lib/dp-packet.h \
 	lib/dp-packet.c \
+	lib/dp-packet-gso.h \
+	lib/dp-packet-gso.c \
 	lib/dpdk.h \
 	lib/dpif-netdev-lookup.h \
 	lib/dpif-netdev-lookup.c \
diff --git a/lib/dp-packet-gso.c b/lib/dp-packet-gso.c
new file mode 100644
index 000000000000..5ae7c88298a5
--- /dev/null
+++ b/lib/dp-packet-gso.c
@@ -0,0 +1,203 @@ 
+/*
+ * Copyright (c) 2021 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <config.h>
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "coverage.h"
+#include "csum.h"
+#include "dp-packet.h"
+#include "dp-packet-gso.h"
+#include "dpif-netdev.h"
+#include "openvswitch/compiler.h"
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/vlog.h"
+#include "packets.h"
+#include "util.h"
+
+VLOG_DEFINE_THIS_MODULE(dp_packet_gso);
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
+/* Fixes up the IPv4 and TCP headers of the 'nb_segs' segments in 'pkts',
+ * each of which starts with a verbatim copy of the headers of 'src'.
+ *
+ * Every segment gets an IP total length matching its own size, an IP ID
+ * that increases by one per segment starting from the ID of 'src', and a
+ * TCP sequence number advanced by the payload bytes of the segments before
+ * it.  FIN and PSH are kept on the last segment only and CWR on the first
+ * one only.  The IP header and TCP checksums are then recomputed. */
+static void
+update_ipv4_tcp_headers(const struct dp_packet *src, struct dp_packet **pkts,
+                        uint16_t nb_segs)
+{
+    struct tcp_header *tcp;
+    struct ip_header *ip;
+    struct dp_packet *p;
+    uint32_t tcp_seq;
+    uint16_t ipid;
+    int i;
+
+    ip = dp_packet_l3(src);
+    ipid = ntohs(ip->ip_id);
+    tcp = dp_packet_l4(src);
+    tcp_seq = ntohl(get_16aligned_be32(&tcp->tcp_seq));
+
+    for (i = 0; i < nb_segs; i++) {
+        p = pkts[i];
+
+        ip = dp_packet_l3(p);
+        ip->ip_tot_len = htons(dp_packet_l3_size(p));
+        ip->ip_id = htons(ipid);
+        ip->ip_csum = 0;
+        /* Cover the whole IP header, including any options. */
+        ip->ip_csum = csum(ip, IP_IHL(ip->ip_ihl_ver) * 4);
+
+        tcp = dp_packet_l4(p);
+        put_16aligned_be32(&tcp->tcp_seq, htonl(tcp_seq));
+        if (i > 0) {
+            tcp->tcp_ctl &= ~htons(TCP_CWR);
+        }
+        if (i < nb_segs - 1) {
+            tcp->tcp_ctl &= ~htons(TCP_FIN | TCP_PSH);
+        }
+        packet_csum_tcpudp(p);
+
+        ipid += 1;
+        /* Advance by this segment's TCP payload length. */
+        tcp_seq += (const char *) dp_packet_tail(p) -
+                   (const char *) dp_packet_l4(p) -
+                   TCP_OFFSET(tcp->tcp_ctl) * 4;
+    }
+}
+
+/* Initializes the metadata of the freshly allocated segment 'dst' from
+ * 'src' by copying every 'struct dp_packet' member from 'l2_pad_size' to
+ * the end of the structure, then clears the offload flags of 'dst' (the
+ * segment checksums are filled in by software later on). */
+static void
+hdr_segment_init(struct dp_packet *dst, const struct dp_packet *src)
+{
+    /* Copy the following fields into the returned buffer: l2_pad_size,
+     * l2_5_ofs, l3_ofs, l4_ofs, cutlen, packet_type and md. */
+    memcpy(&dst->l2_pad_size, &src->l2_pad_size,
+           sizeof(struct dp_packet) -
+           offsetof(struct dp_packet, l2_pad_size));
+
+    *dp_packet_ol_flags_ptr(dst) = 0;
+}
+
+/* Splits the payload of 'p', i.e. everything past its first 'hdr_offset'
+ * bytes, into chunks of at most 'pyld_unit_size' bytes.  Each chunk is
+ * copied into a newly allocated packet that begins with a copy of the
+ * first 'hdr_offset' bytes of 'p'.  The new packets are stored in 'pout',
+ * which has room for 'nb_pout' pointers; the caller owns them.
+ *
+ * Returns the number of segments created, which is 0 if 'p' has no payload,
+ * -EINVAL if 'pyld_unit_size' is 0, or -ENOMEM if 'pout' is too small.
+ * Nothing is left in 'pout' on failure. */
+static int
+gso_do_segment(const struct dp_packet *p, uint16_t hdr_offset,
+               uint16_t pyld_unit_size, struct dp_packet **pout,
+               uint16_t nb_pout)
+{
+    const char *data = dp_packet_data(p);
+    uint32_t pkt_size = dp_packet_size(p);
+    uint32_t bytes_remaining;
+    uint32_t pos = hdr_offset;
+    int nb_segs = 0;
+
+    if (!pyld_unit_size) {
+        /* A zero-sized unit would never make progress. */
+        return -EINVAL;
+    }
+    bytes_remaining = pkt_size > hdr_offset ? pkt_size - hdr_offset : 0;
+
+    while (bytes_remaining > 0) {
+        struct dp_packet *pkt;
+        uint16_t seg_size;
+
+        /* Check for room before storing, not after. */
+        if (nb_segs >= nb_pout) {
+            VLOG_WARN_RL(&rl, "Not enough memory to process GSO.");
+            while (nb_segs > 0) {
+                dp_packet_delete(pout[--nb_segs]);
+            }
+            return -ENOMEM;
+        }
+
+        seg_size = (bytes_remaining >= pyld_unit_size) ?
+                   pyld_unit_size : bytes_remaining;
+
+        /* Create a new dp_packet, put payload, push header. */
+        pkt = dp_packet_new_with_headroom(seg_size, hdr_offset);
+        hdr_segment_init(pkt, p);
+        dp_packet_put(pkt, data + pos, seg_size);
+        dp_packet_push(pkt, data, hdr_offset);
+
+        pos += seg_size;
+        bytes_remaining -= seg_size;
+        pout[nb_segs++] = pkt;
+    }
+    return nb_segs;
+}
+
+/* Segments the IPv4 TCP packet 'p' in software into frames of at most
+ * 'gso_size' bytes each, Ethernet, IP and TCP headers included.  The
+ * segments are newly allocated packets stored in 'pout', which has room for
+ * 'nb_pout' pointers.  'p' itself is not modified or freed.
+ *
+ * Returns the number of segments stored in 'pout', or a value less than or
+ * equal to zero if no segment was produced. */
+int
+gso_tcp4_segment(struct dp_packet *p, uint16_t gso_size,
+                 struct dp_packet **pout, uint16_t nb_pout)
+{
+    const char *payload = dp_packet_get_tcp_payload(p);
+    const char *eth = dp_packet_eth(p);
+    uint16_t pyld_unit_size, hdr_offset;
+    int nb_segs;
+
+    if (OVS_UNLIKELY(dp_packet_size(p) < ETH_PAYLOAD_MAX)) {
+        VLOG_WARN_RL(&rl, "Packet size %u bytes too small for GSO.",
+                     dp_packet_size(p));
+        return -EINVAL;
+    }
+
+    if (OVS_UNLIKELY(!payload || !eth)) {
+        VLOG_WARN_RL(&rl, "Packet has no valid TCP header for GSO.");
+        return -EINVAL;
+    }
+
+    hdr_offset = payload - eth;
+    if (OVS_UNLIKELY(gso_size <= hdr_offset)) {
+        /* The subtraction below would wrap around. */
+        VLOG_WARN_RL(&rl, "GSO size %"PRIu16" does not exceed the %"PRIu16
+                     " header bytes.", gso_size, hdr_offset);
+        return -EINVAL;
+    }
+    pyld_unit_size = gso_size - hdr_offset;
+
+    nb_segs = gso_do_segment(p, hdr_offset, pyld_unit_size, pout, nb_pout);
+    if (nb_segs > 0) {
+        /* Update the IP and TCP headers and checksums of each segment. */
+        update_ipv4_tcp_headers(p, pout, nb_segs);
+    }
+
+    return nb_segs;
+}
diff --git a/lib/dp-packet-gso.h b/lib/dp-packet-gso.h
new file mode 100644
index 000000000000..d33d904c9e22
--- /dev/null
+++ b/lib/dp-packet-gso.h
@@ -0,0 +1,37 @@ 
+/*
+ * Copyright (c) 2021 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DP_PACKET_GSO_H
+#define DP_PACKET_GSO_H 1
+
+#include <stdint.h>
+#include <stdbool.h>
+
+/* Only pointers to packets are used here, so a forward declaration keeps
+ * this header self-contained without including dp-packet.h. */
+struct dp_packet;
+
+/* Software segmentation of IPv4 TCP packet 'p' into at most 'nb_pouts'
+ * packets stored in 'pouts'.  Returns the number of segments produced; a
+ * value less than or equal to zero means that no segment was produced. */
+int gso_tcp4_segment(struct dp_packet *p, uint16_t gso_size,
+                     struct dp_packet **pouts, uint16_t nb_pouts);
+
+/* NOTE(review): declared for planned UDP support; no definition accompanies
+ * this declaration yet, so calling it fails at link time. */
+int gso_udp4_segment(struct dp_packet *p, uint16_t gso_size,
+                     struct dp_packet **pouts, uint16_t nb_pouts);
+#endif /* dp-packet-gso.h */
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index 482400d8d135..bf7b85d73a63 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -47,6 +47,7 @@ 
 #include "ovs-numa.h"
 #include "packets.h"
 #include "socket-util.h"
+#include "userspace-tso.h"
 #include "util.h"
 
 #ifndef SOL_XDP
@@ -867,6 +868,7 @@  netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
                             FRAME_SIZE - FRAME_HEADROOM,
                             OVS_XDP_HEADROOM);
         dp_packet_set_size(packet, len);
+        *dp_packet_ol_flags_ptr(packet) = 0;
 
         /* Add packet into batch, increase batch->count. */
         dp_packet_batch_add(batch, packet);
@@ -1187,6 +1189,10 @@  netdev_afxdp_construct(struct netdev *netdev)
     dev->xsks = NULL;
     dev->tx_locks = NULL;
 
+    if (userspace_tso_enabled()) {
+        netdev->ol_flags = 0;
+    }
+
     netdev_request_reconfigure(netdev);
     return 0;
 }
diff --git a/lib/netdev.c b/lib/netdev.c
index 91e91955c09b..691ce81a01be 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -34,6 +34,7 @@ 
 #include "cmap.h"
 #include "coverage.h"
 #include "dpif.h"
+#include "dp-packet-gso.h"
 #include "dp-packet.h"
 #include "openvswitch/dynamic-string.h"
 #include "fatal-signal.h"
@@ -797,7 +798,6 @@  netdev_send_prepare_packet(const uint64_t netdev_flags,
     if (dp_packet_hwol_is_tso(packet)
         && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
             /* Fall back to GSO in software. */
-            VLOG_ERR_BUF(errormsg, "No TSO support");
             return false;
     }
 
@@ -806,8 +806,8 @@  netdev_send_prepare_packet(const uint64_t netdev_flags,
         if (dp_packet_hwol_l4_is_tcp(packet)) {
             if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
                 /* Fall back to TCP csum in software. */
-                VLOG_ERR_BUF(errormsg, "No TCP checksum support");
-                return false;
+                packet_csum_tcpudp(packet);
+                return true;
             }
         } else if (dp_packet_hwol_l4_is_udp(packet)) {
             if (!(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
@@ -835,7 +835,8 @@  netdev_send_prepare_packet(const uint64_t netdev_flags,
  * otherwise either fall back to software implementation or drop it. */
 static void
 netdev_send_prepare_batch(const struct netdev *netdev,
-                          struct dp_packet_batch *batch)
+                          struct dp_packet_batch *batch,
+                          struct dp_packet_batch *gso_batch)
 {
     struct dp_packet *packet;
     size_t i, size = dp_packet_batch_size(batch);
@@ -846,11 +847,16 @@  netdev_send_prepare_batch(const struct netdev *netdev,
         if (netdev_send_prepare_packet(netdev->ol_flags, packet, &errormsg)) {
             dp_packet_batch_refill(batch, packet, i);
         } else {
-            dp_packet_delete(packet);
-            COVERAGE_INC(netdev_send_prepare_drops);
-            VLOG_WARN_RL(&rl, "%s: Packet dropped: %s",
-                         netdev_get_name(netdev), errormsg);
-            free(errormsg);
+            if (dp_packet_hwol_is_tso(packet) &&
+                !(netdev->ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
+                dp_packet_batch_add(gso_batch, packet);
+            } else {
+                dp_packet_delete(packet);
+                COVERAGE_INC(netdev_send_prepare_drops);
+                VLOG_WARN_RL(&rl, "%s: Packet dropped: %s",
+                             netdev_get_name(netdev), errormsg);
+                free(errormsg);
+            }
         }
     }
 }
@@ -884,17 +890,74 @@  int
 netdev_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch,
             bool concurrent_txq)
 {
-    int error;
+    struct dp_packet_batch *gso_batch_ptr;
+    struct dp_packet_batch gso_batch;
+    struct dp_packet **gso_pkts;
+    struct dp_packet *packet;
+    uint16_t gso_pkts_len;
+    int error = 0;
 
-    netdev_send_prepare_batch(netdev, batch);
-    if (OVS_UNLIKELY(dp_packet_batch_is_empty(batch))) {
-        return 0;
+    /* Packets that need TSO on a device without TSO support are moved from
+     * 'batch' into 'gso_batch' so that they can be segmented in software. */
+    dp_packet_batch_init(&gso_batch);
+    netdev_send_prepare_batch(netdev, batch, &gso_batch);
+
+    if (!dp_packet_batch_is_empty(batch)) {
+        error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
+        if (!error) {
+            COVERAGE_INC(netdev_sent);
+        }
     }
 
-    error = netdev->netdev_class->send(netdev, qid, batch, concurrent_txq);
-    if (!error) {
-        COVERAGE_INC(netdev_sent);
+    if (dp_packet_batch_is_empty(&gso_batch)) {
+        return error;
     }
+
+    /* One scratch array is reused for the segments of every TSO packet. */
+    gso_batch_ptr = &gso_batch;
+    gso_pkts_len = 2 * NETDEV_MAX_BURST;
+    gso_pkts = xmalloc(gso_pkts_len * sizeof *gso_pkts);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, gso_batch_ptr) {
+        struct dp_packet_batch seg_batch;
+        uint16_t gso_size = 1000; /* How to decide gso_size? */
+        int nb_segs;
+
+        /* 'nb_segs' is signed because failures are reported as values less
+         * than or equal to zero. */
+        nb_segs = gso_tcp4_segment(packet, gso_size, gso_pkts, gso_pkts_len);
+        if (nb_segs <= 0 || nb_segs > gso_pkts_len) {
+            VLOG_WARN("GSO tcp4 segment failed");
+            error = EINVAL;
+            break;
+        }
+        dp_packet_batch_init(&seg_batch);
+
+        /* 'i' is the index of the enclosing loop, so use another one here. */
+        for (int j = 0; j < nb_segs; j++) {
+            dp_packet_batch_add(&seg_batch, gso_pkts[j]);
+
+            /* Send when the batch is full and after the last segment. */
+            if (dp_packet_batch_is_full(&seg_batch) || j == nb_segs - 1) {
+                int err = netdev->netdev_class->send(netdev, qid, &seg_batch,
+                                                     concurrent_txq);
+
+                if (!err) {
+                    COVERAGE_INC(netdev_sent);
+                } else if (!error) {
+                    /* Keep the first failure for the caller. */
+                    error = err;
+                }
+                dp_packet_batch_init(&seg_batch);
+            }
+        }
+    }
+
+    /* The TSO packets left in 'gso_batch' are still owned here; release
+     * them on both the success and the failure path. */
+    dp_packet_delete_batch(gso_batch_ptr, true);
+    free(gso_pkts);
+
     return error;
 }
 
diff --git a/lib/packets.c b/lib/packets.c
index 4a7643c5dd3a..20702d25c2af 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -1887,3 +1887,45 @@  IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6)
         }
     }
 }
+
+/* Computes in software the TCP or UDP checksum of the IPv4 packet 'p' and
+ * stores it in the L4 header.
+ *
+ * Packets whose Ethernet type is not IPv4, whose L3/L4 headers were not
+ * located or are truncated, or whose IP protocol is neither TCP nor UDP are
+ * left untouched. */
+void
+packet_csum_tcpudp(struct dp_packet *p)
+{
+    struct eth_header *eth = dp_packet_eth(p);
+    struct ip_header *ip = dp_packet_l3(p);
+    void *l4 = dp_packet_l4(p);
+    uint32_t pseudo_hdr_csum;
+    size_t l4_size;
+
+    if (!eth || !ip || !l4 || eth->eth_type != htons(ETH_TYPE_IP)) {
+        return;
+    }
+    l4_size = dp_packet_l4_size(p);
+
+    if (ip->ip_proto == IPPROTO_TCP && l4_size >= TCP_HEADER_LEN) {
+        struct tcp_header *tcp = l4;
+
+        pseudo_hdr_csum = packet_csum_pseudoheader(ip);
+        tcp->tcp_csum = 0;
+        tcp->tcp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
+                                                  tcp, l4_size));
+    } else if (ip->ip_proto == IPPROTO_UDP && l4_size >= UDP_HEADER_LEN) {
+        struct udp_header *udp = l4;
+
+        pseudo_hdr_csum = packet_csum_pseudoheader(ip);
+        udp->udp_csum = 0;
+        udp->udp_csum = csum_finish(csum_continue(pseudo_hdr_csum,
+                                                  udp, l4_size));
+        if (!udp->udp_csum) {
+            /* RFC 768: a computed checksum of zero is sent as all ones,
+             * because zero means "no checksum". */
+            udp->udp_csum = htons(0xffff);
+        }
+    }
+}
diff --git a/lib/packets.h b/lib/packets.h
index 481bc22fa1fe..108087f916ac 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -1635,6 +1635,7 @@  void packet_put_ra_prefix_opt(struct dp_packet *,
                               const ovs_be128 router_prefix);
 uint32_t packet_csum_pseudoheader(const struct ip_header *);
 void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
+void packet_csum_tcpudp(struct dp_packet *p);
 
 #define DNS_HEADER_LEN 12
 struct dns_header {
diff --git a/tests/system-afxdp.at b/tests/system-afxdp.at
index 0d09906fb6c8..3c6a7708435c 100644
--- a/tests/system-afxdp.at
+++ b/tests/system-afxdp.at
@@ -45,3 +45,35 @@  NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0],
 
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
+
+dnl p0 in at_ns0 sends TSO packets to ovs-p0, which is attached to OVS as a
+dnl system port; ovs-p1 is attached to OVS as type=afxdp.
+AT_SETUP([AF_XDP - enable userspace TSO])
+AT_KEYWORDS([afxdp tso])
+OVS_TRAFFIC_VSWITCHD_START()
+
+AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:userspace-tso-enable=true])
+AT_CHECK([ovs-ofctl add-flow br0 "actions=normal"])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+dnl Create and add ovs-p0 as system port
+ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24")
+AT_CHECK([ovs-vsctl del-port ovs-p0])
+AT_CHECK([ovs-vsctl add-port br0 ovs-p0])
+dnl Enable tx offload at p0, so ovs-p0 sees TSO packets
+NS_CHECK_EXEC([at_ns0], [ethtool -K p0 tx on > /dev/null 2>&1])
+
+dnl Create and add ovs-p1 as afxdp port
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+
+NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+dnl Send a TSO from ns0 to ns1
+NETNS_DAEMONIZE([at_ns1], [iperf -s], [iperf.pid])
+NS_CHECK_EXEC([at_ns0], [iperf -c 10.1.1.2 -t1 1> /dev/null], [0])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP