diff mbox series

[ovs-dev,RFC,1/2] netdev-dpdk: Consider packets marked for TSO.

Message ID 1533742768-204340-2-git-send-email-tiago.lam@intel.com
State RFC
Delegated to: Ian Stokes
Headers show
Series dpdk: Add support for TSO | expand

Commit Message

Lam, Tiago Aug. 8, 2018, 3:39 p.m. UTC
Previously, TSO was being explicitly disabled on vhost interfaces,
meaning the guests wouldn't have TSO support negotiated in. With TSO
negotiated and enabled, packets are now marked for TSO, through the
PKT_TX_TCP_SEG flag.

In order to deal with this type of packet, a new function,
netdev_dpdk_prep_tso_packet(), has been introduced, with the main
purpose of correctly setting the l2, l3 and l4 length members of the
mbuf struct, and the appropriate ol_flags. This function supports TSO
both in IPv4 and IPv6.

netdev_dpdk_prep_tso_packet() is then only called when packets are
marked with the PKT_TX_TCP_SEG flag, meaning they have been marked for
TSO, and when the packet will be traversing the NIC.

Co-authored-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

Signed-off-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
Signed-off-by: Tiago Lam <tiago.lam@intel.com>
---
 lib/dp-packet.c   |   5 ++-
 lib/netdev-dpdk.c | 120 +++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 96 insertions(+), 29 deletions(-)
diff mbox series

Patch

diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 6773535..412c553 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -56,7 +56,6 @@  dp_packet_copy_mbuf_flags(struct dp_packet *dst, const struct dp_packet *src)
     struct rte_mbuf *buf_dst = &(dst->mbuf);
     struct rte_mbuf buf_src = src->mbuf;
 
-    buf_dst->nb_segs = buf_src.nb_segs;
     buf_dst->ol_flags = buf_src.ol_flags;
     buf_dst->packet_type = buf_src.packet_type;
     buf_dst->tx_offload = buf_src.tx_offload;
@@ -184,6 +183,7 @@  dp_packet_clone_with_headroom(const struct dp_packet *b, size_t headroom) {
     /* copy multi-seg data */
     if (b->source == DPBUF_DPDK && b->mbuf.nb_segs > 1) {
         void *dst = NULL;
+        struct rte_mbuf *new_mbuf = NULL;
         struct rte_mbuf *mbuf = CONST_CAST(struct rte_mbuf *, &b->mbuf);
 
         new_buffer = dp_packet_new_with_headroom(pkt_len, headroom);
@@ -193,6 +193,9 @@  dp_packet_clone_with_headroom(const struct dp_packet *b, size_t headroom) {
         if (!rte_pktmbuf_read(mbuf, 0, pkt_len, dst)) {
             return NULL;
         }
+
+        new_mbuf = CONST_CAST(struct rte_mbuf *, &new_buffer->mbuf);
+        new_mbuf->nb_segs = 1;
     } else {
         new_buffer = dp_packet_clone_data_with_headroom(dp_packet_data(b),
                                                         dp_packet_size(b),
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index b18b768..5da5996 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -28,6 +28,8 @@ 
 
 #include <rte_bus_pci.h>
 #include <rte_config.h>
+#include "rte_ip.h"
+#include "rte_tcp.h"
 #include <rte_cycles.h>
 #include <rte_errno.h>
 #include <rte_eth_ring.h>
@@ -1375,16 +1377,6 @@  netdev_dpdk_vhost_construct(struct netdev *netdev)
         goto out;
     }
 
-    err = rte_vhost_driver_disable_features(dev->vhost_id,
-                                1ULL << VIRTIO_NET_F_HOST_TSO4
-                                | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                | 1ULL << VIRTIO_NET_F_CSUM);
-    if (err) {
-        VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
-                 "port: %s\n", name);
-        goto out;
-    }
-
     err = rte_vhost_driver_start(dev->vhost_id);
     if (err) {
         VLOG_ERR("rte_vhost_driver_start failed for vhost user "
@@ -2019,6 +2011,57 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+/* Fills in the l2, l3 and l4 length members of 'mbuf' from its dp_packet.
+ * If PKT_TX_TCP_SEG is set, also sets PKT_TX_TCP_CKSUM and PKT_TX_IP_CKSUM,
+ * derives tso_segsz from 'mtu' and stores the pseudo-header checksum in the
+ * TCP header, treating the packet as IPv6 unless PKT_TX_IPV4 is set. */
+static void
+netdev_dpdk_prep_tso_packet(struct rte_mbuf *mbuf, int mtu)
+{
+    struct dp_packet *pkt;
+    struct tcp_header *th;
+    struct ether_hdr *m_eth_hdr;
+    struct tcp_hdr *m_tcp_hdr;
+    char *m_l3_hdr;
+
+    pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
+    mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
+    mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
+    th = dp_packet_l4(pkt);
+    /* No layer 4. NOTE(review): l3_len above was computed from a NULL l4. */
+    if (!th) {
+        return;
+    }
+    mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
+    mbuf->outer_l2_len = 0;
+    mbuf->outer_l3_len = 0;
+
+    if (!(mbuf->ol_flags & PKT_TX_TCP_SEG)) {
+        return;
+    }
+
+    m_eth_hdr = rte_pktmbuf_mtod(mbuf, struct ether_hdr *);
+    m_l3_hdr = (char *) m_eth_hdr + mbuf->l2_len;
+    m_tcp_hdr = (struct tcp_hdr *) ((char *) m_l3_hdr + mbuf->l3_len);
+
+    mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
+    mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+
+    /* Per-segment size: 'mtu' minus the L3 and L4 header lengths. */
+    mbuf->tso_segsz = mtu - mbuf->l3_len - mbuf->l4_len;
+
+    if (mbuf->ol_flags & PKT_TX_IPV4) {
+        /* IPv4: zero the IP header checksum, then store the pseudo-header sum. */
+        struct ipv4_hdr *ipv4_hdr = (struct ipv4_hdr *) m_l3_hdr;
+        ipv4_hdr->hdr_checksum = 0;
+        m_tcp_hdr->cksum = (rte_ipv4_phdr_cksum(ipv4_hdr, mbuf->ol_flags));
+    } else {
+        /* Not flagged as IPv4, so the L3 header is handled as IPv6. */
+        struct ipv6_hdr *ipv6_hdr = (struct ipv6_hdr *) m_l3_hdr;
+        m_tcp_hdr->cksum = (rte_ipv6_phdr_cksum(ipv6_hdr, mbuf->ol_flags));
+    }
+}
+
 /* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
  * 'pkts', even in case of failure.
  *
@@ -2300,13 +2343,29 @@  netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
     int cnt = 0;
     struct rte_mbuf *pkt;
 
+    /* Filter oversized packets, unless they are marked for TSO. */
     for (i = 0; i < pkt_cnt; i++) {
         pkt = pkts[i];
+
         if (OVS_UNLIKELY(pkt->pkt_len > dev->max_packet_len)) {
-            VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " max_packet_len %d",
-                         dev->up.name, pkt->pkt_len, dev->max_packet_len);
-            rte_pktmbuf_free(pkt);
-            continue;
+            if (!(pkt->ol_flags & PKT_TX_TCP_SEG)) {
+                VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " "
+                             "max_packet_len %d",
+                             dev->up.name, pkt->pkt_len, dev->max_packet_len);
+                rte_pktmbuf_free(pkt);
+                continue;
+            } else {
+                if (dev->type != DPDK_DEV_VHOST) {
+                    netdev_dpdk_prep_tso_packet(pkt, dev->mtu);
+                }
+
+                /* Else the frames will not actually traverse the NIC, but
+                 * rather travel between VMs on the same host. */
+            }
+        } else {
+            if (dev->type != DPDK_DEV_VHOST) {
+                netdev_dpdk_prep_tso_packet(pkt, dev->mtu);
+            }
         }
 
         if (OVS_UNLIKELY(i != cnt)) {
@@ -2430,6 +2489,12 @@  dpdk_copy_dp_packet_to_mbuf(struct dp_packet *packet, struct rte_mbuf **head,
     fmbuf->nb_segs = nb_segs;
     fmbuf->pkt_len = size;
 
+    struct dp_packet *pkt = CONTAINER_OF(fmbuf, struct dp_packet, mbuf);
+    pkt->l2_pad_size = packet->l2_pad_size;
+    pkt->l2_5_ofs = packet->l2_5_ofs;
+    pkt->l3_ofs = packet->l3_ofs;
+    pkt->l4_ofs = packet->l4_ofs;
+
     dp_packet_mbuf_write(fmbuf, 0, size, dp_packet_data(packet));
 
     return 0;
@@ -2464,14 +2529,17 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
 
     for (i = 0; i < cnt; i++) {
         struct dp_packet *packet = batch->packets[i];
+        struct rte_mbuf *pkt = &batch->packets[i]->mbuf;
         uint32_t size = dp_packet_size(packet);
         int err = 0;
 
         if (OVS_UNLIKELY(size > dev->max_packet_len)) {
-            VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
-                         size, dev->max_packet_len);
-            dropped++;
-            continue;
+            if (!(pkt->ol_flags & PKT_TX_TCP_SEG)) {
+                VLOG_WARN_RL(&rl, "Too big size %u max_packet_len %d",
+                             size, dev->max_packet_len);
+                dropped++;
+                continue;
+            }
         }
 
         err = dpdk_copy_dp_packet_to_mbuf(packet, &pkts[txcnt],
@@ -2487,6 +2555,12 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
         }
         dp_packet_copy_mbuf_flags((struct dp_packet *)pkts[txcnt], packet);
 
+        if (dev->type != DPDK_DEV_VHOST) {
+            /* If packet is non-DPDK, at the very least, we need to update the
+             * mbuf length members, even if TSO is not to be performed. */
+            netdev_dpdk_prep_tso_packet(pkts[txcnt], dev->mtu);
+        }
+
         txcnt++;
     }
 
@@ -4137,16 +4211,6 @@  netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
             goto unlock;
         }
 
-        err = rte_vhost_driver_disable_features(dev->vhost_id,
-                                    1ULL << VIRTIO_NET_F_HOST_TSO4
-                                    | 1ULL << VIRTIO_NET_F_HOST_TSO6
-                                    | 1ULL << VIRTIO_NET_F_CSUM);
-        if (err) {
-            VLOG_ERR("rte_vhost_driver_disable_features failed for vhost user "
-                     "client port: %s\n", dev->up.name);
-            goto unlock;
-        }
-
         err = rte_vhost_driver_start(dev->vhost_id);
         if (err) {
             VLOG_ERR("rte_vhost_driver_start failed for vhost user "