[ovs-dev,RFC] Enable VXLAN TSO for dpdk datapath

Message ID 20200525090337.110640-1-yang_y_yi@163.com
State RFC
Series [ovs-dev,RFC] Enable VXLAN TSO for dpdk datapath

Commit Message

yang_y_yi May 25, 2020, 9:03 a.m. UTC
From: Yi Yang <yangyi01@inspur.com>

This patch just shows developers how VXLAN TSO works;
it isn't ready to be merged. Comments are welcome.
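
A minimal sketch of how this would be exercised (the bridge name and
remote IP below are placeholders; the NIC must report
DEV_TX_OFFLOAD_VXLAN_TNL_TSO for the hardware path to be taken):

  $ ovs-vsctl set Open_vSwitch . other_config:userspace-tso-enable=true
  $ ovs-vsctl add-port br-int vxlan0 -- set interface vxlan0 \
        type=vxlan options:remote_ip=172.31.1.1

With that in place, TCP traffic entering the tunnel should be segmented
by the NIC instead of in software.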

Signed-off-by: Yi Yang <yangyi01@inspur.com>
---
 lib/dp-packet.h    |  33 +++++++++++
 lib/netdev-dpdk.c  | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 lib/netdev-linux.c |  20 +++++++
 lib/netdev.c       |  16 ++---
 4 files changed, 220 insertions(+), 16 deletions(-)

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 0430cca..1ed5eba 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -1032,6 +1032,39 @@  dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
     *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
 }
 
+/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
+static inline void
+dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
+{
+    b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
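+    /* For VXLAN TSO, DPDK expects outer_l2_len/outer_l3_len to describe
+     * the outer Ethernet/IPv4 headers, while l2_len grows by the outer
+     * UDP and VXLAN header lengths so that it spans everything up to
+     * the inner L3 header.  This sketch assumes an untagged IPv4 outer
+     * frame, as hardcoded below. */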
+    b->mbuf.l2_len += sizeof(struct udp_header) +
+                      sizeof(struct vxlanhdr);
+    b->mbuf.outer_l2_len = ETH_HEADER_LEN;
+    b->mbuf.outer_l3_len = IP_HEADER_LEN;
+}
+
+/* Set l2_len for packet 'b'. */
+static inline void
+dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
+{
+    b->mbuf.l2_len = l2_len;
+}
+
+/* Set l3_len for packet 'b'. */
+static inline void
+dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
+{
+    b->mbuf.l3_len = l3_len;
+}
+
+/* Set l4_len for packet 'b'. */
+static inline void
+dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
+{
+    b->mbuf.l4_len = l4_len;
+}
+
 static inline bool
 dp_packet_ip_checksum_valid(const struct dp_packet *p)
 {
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 44ebf96..bd9696d 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -44,6 +44,7 @@ 
 #include <rte_pci.h>
 #include <rte_version.h>
 #include <rte_vhost.h>
+#include <rte_ip.h>
 
 #include "cmap.h"
 #include "coverage.h"
@@ -405,6 +406,7 @@  enum dpdk_hw_ol_features {
     NETDEV_RX_HW_SCATTER = 1 << 2,
     NETDEV_TX_TSO_OFFLOAD = 1 << 3,
     NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
+    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
 };
 
 /*
@@ -988,6 +990,12 @@  dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
 
     if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
         conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
+        /* Enable VXLAN TSO support if available. */
+        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
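+            /* Chained-mbuf transmit (see
+             * dpdk_copy_dp_packet_to_chained_mbuf()) requires
+             * multi-segment Tx support. */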
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
+        }
         if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
             conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
         }
@@ -1126,6 +1134,10 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev)
         if ((info.tx_offload_capa & tx_tso_offload_capa)
             == tx_tso_offload_capa) {
             dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
+            /* Enable VXLAN TSO support if available. */
+            if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
+                dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+            }
             if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
                 dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
             } else {
@@ -2131,6 +2143,57 @@  netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
     rte_free(rx);
 }
 
+/* Prepare packet 'mbuf' for VXLAN hardware offloading. */
+static void
+netdev_dpdk_prep_vxlan_hwol_packet(struct netdev_dpdk *dev OVS_UNUSED,
+                                   struct rte_mbuf *mbuf)
+{
+    struct rte_ether_hdr *eth_hdr =
+                rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
+    struct rte_ipv4_hdr *ip_hdr;
+    struct rte_udp_hdr *udp_hdr;
+
+    /* Clear offload flags carried over from the Rx side. */
+    mbuf->ol_flags &= ~PKT_RX_RSS_HASH;
+
+    if ((mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) == 0) {
+        return;
+    }
+
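+    /* Request checksum offload for both the outer and the inner IPv4
+     * header; the inner header starts l2_len bytes past the outer UDP
+     * header (outer UDP + VXLAN + inner Ethernet). */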
+    if (mbuf->ol_flags & PKT_TX_IPV4) {
+        ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+        udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
+
+        /* Outer IP checksum. */
+        ip_hdr->hdr_checksum = 0;
+        mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
+        mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
+
+        /* Inner IP checksum. */
+        mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        ip_hdr = (struct rte_ipv4_hdr *)((uint8_t *)udp_hdr + mbuf->l2_len);
+        ip_hdr->hdr_checksum = 0;
+    }
+
+    if (mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_TCP_CKSUM)) {
+        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
+        if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
+            /* XXX: the inner MTU is hardcoded to 1450 for now; it should
+             * be derived from dev->mtu and the encapsulation overhead. */
+            mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
+        } else {
+            /* A non-TSO packet needs no segment size. */
+            mbuf->tso_segsz = 0;
+        }
+    }
+}
+
 /* Prepare the packet for HWOL.
  * Return True if the packet is OK to continue. */
 static bool
@@ -2159,6 +2222,9 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
 
         if (mbuf->ol_flags & PKT_TX_IPV4) {
+            struct ip_header *ip_hdr = dp_packet_l3(pkt);
+
+            ip_hdr->ip_csum = 0;
             mbuf->ol_flags |= PKT_TX_IP_CKSUM;
         }
     }
@@ -2737,13 +2803,97 @@  dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
 
     mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
     mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
-    mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
-                            ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
+    mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
+    /* XXX: unlike the original code, EXT_ATTACHED_MBUF and
+     * IND_ATTACHED_MBUF are no longer masked out; this is only safe if
+     * the source never carries those attach flags. */
+    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
+    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
+    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
+    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
+    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
+
+    memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
+           sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
+
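+    /* For tunnelled packets (outer_l2_len != 0), keep the offload
+     * lengths that were set at encapsulation time; recompute them only
+     * for plain L4 offload. */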
+    if ((mbuf_dest->outer_l2_len == 0) &&
+        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
+        mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
+                                - (char *)dp_packet_eth(pkt_dest);
+        mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
+                                - (char *) dp_packet_l3(pkt_dest);
+    }
+
+    return pkt_dest;
+}
+
+static struct dp_packet *
+dpdk_copy_dp_packet_to_chained_mbuf(struct rte_mempool *mp,
+                                    struct dp_packet *pkt_orig,
+                                    int mbuf_len)
+{
+    struct rte_mbuf *mbuf_dest;
+    struct dp_packet *pkt_dest;
+    uint32_t pkt_len;
+
+    pkt_len = dp_packet_size(pkt_orig);
+    if (pkt_len <= mbuf_len) { /* A single mbuf suffices. */
+        mbuf_dest = rte_pktmbuf_alloc(mp);
+        if (OVS_UNLIKELY(mbuf_dest == NULL)) {
+            return NULL;
+        }
+
+        pkt_dest = CONTAINER_OF(mbuf_dest, struct dp_packet, mbuf);
+        memcpy(dp_packet_data(pkt_dest), dp_packet_data(pkt_orig), pkt_len);
+        dp_packet_set_size(pkt_dest, pkt_len);
+    } else { /* Chain multiple mbufs. */
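+        /* XXX: 48 segments covers a 64K TSO frame at common MTUs, but
+         * a robust version should bound-check 'count' against it. */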
+        struct rte_mbuf *mbufs[48];
+        int ret;
+        int count;
+        int i;
+        uint32_t pkt_off = 0;
+        uint32_t seg_len = mbuf_len;
+        uint32_t left_len = pkt_len;
+
+        count = DIV_ROUND_UP(pkt_len, mbuf_len);
+        ret = rte_pktmbuf_alloc_bulk(mp, mbufs, count);
+        if (OVS_UNLIKELY(ret != 0)) {
+            return NULL;
+        }
+
+        mbuf_dest = mbufs[0];
+        for (i = 0; i < count; i++) {
+            pkt_dest = CONTAINER_OF(mbufs[i], struct dp_packet, mbuf);
+            memcpy(dp_packet_data(pkt_dest),
+                   (char *) dp_packet_data(pkt_orig) + pkt_off, seg_len);
+            mbufs[i]->nb_segs = 1;
+            mbufs[i]->next = NULL;
+            dp_packet_set_size(pkt_dest, seg_len);
+            pkt_off += seg_len;
+            left_len -= seg_len;
+            if (left_len < mbuf_len) {
+                seg_len = left_len;
+            }
+            if (i > 0) {
+                mbufs[i - 1]->next = mbufs[i];
+                mbuf_dest->nb_segs += 1;
+                mbuf_dest->pkt_len += mbufs[i]->pkt_len;
+            }
+        }
+    }
+
+    mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
+    mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
+    mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
+    /* XXX: as in dpdk_copy_dp_packet_to_mbuf(), the mbuf attach flags
+     * are no longer masked out of the copy. */
+    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
+    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
+    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
+    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
+    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
 
     memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
            sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
 
-    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
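+    /* As above, preserve encapsulation-time lengths for tunnelled
+     * packets. */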
+    if ((mbuf_dest->outer_l2_len == 0) &&
+        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
         mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
                                 - (char *)dp_packet_eth(pkt_dest);
         mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
@@ -2753,6 +2903,7 @@  dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
     return pkt_dest;
 }
 
 /* Tx function. Transmit packets indefinitely */
 static void
 dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
@@ -2773,6 +2924,7 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
     uint32_t tx_failure = 0;
     uint32_t mtu_drops = 0;
     uint32_t qos_drops = 0;
+    struct rte_mbuf *mbuf;
 
     if (dev->type != DPDK_DEV_VHOST) {
         /* Check if QoS has been configured for this netdev. */
@@ -2795,12 +2947,15 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
             continue;
         }
 
-        pkts[txcnt] = dpdk_copy_dp_packet_to_mbuf(dev->dpdk_mp->mp, packet);
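+        /* A TSO frame can exceed a single mbuf, so copy into a chain
+         * sized by the device MTU plus the Ethernet header. */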
+        pkts[txcnt] = dpdk_copy_dp_packet_to_chained_mbuf(
+                          dev->dpdk_mp->mp, packet,
+                          dev->mtu + RTE_ETHER_HDR_LEN);
         if (OVS_UNLIKELY(!pkts[txcnt])) {
             dropped = cnt - i;
             break;
         }
 
+        mbuf = pkts[txcnt];
+        netdev_dpdk_prep_vxlan_hwol_packet(dev, mbuf);
+
         txcnt++;
     }
 
@@ -4949,6 +5104,10 @@  netdev_dpdk_reconfigure(struct netdev *netdev)
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
+        /* Enable VXLAN TSO support if available. */
+        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+            netdev->ol_flags |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+        }
         if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
             netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
         }
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 6269c24..f6e80fc 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -6500,6 +6500,8 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
     struct eth_header *eth_hdr;
     ovs_be16 eth_type;
     int l2_len;
+    int l3_len = 0;
+    int l4_len = 0;
 
     eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
     if (!eth_hdr) {
@@ -6519,6 +6521,8 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
         l2_len += VLAN_HEADER_LEN;
     }
 
+    dp_packet_hwol_set_l2_len(b, l2_len);
+
     if (eth_type == htons(ETH_TYPE_IP)) {
         struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
 
@@ -6526,6 +6530,7 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
             return -EINVAL;
         }
 
+        l3_len = IP_HEADER_LEN;
         *l4proto = ip_hdr->ip_proto;
         dp_packet_hwol_set_tx_ipv4(b);
     } else if (eth_type == htons(ETH_TYPE_IPV6)) {
@@ -6536,10 +6541,25 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
             return -EINVAL;
         }
 
+        l3_len = IPV6_HEADER_LEN;
         *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
         dp_packet_hwol_set_tx_ipv6(b);
     }
 
+    dp_packet_hwol_set_l3_len(b, l3_len);
+
+    if (*l4proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr = dp_packet_at(b, l2_len + l3_len,
+                                                  sizeof(struct tcp_header));
+
+        if (!tcp_hdr) {
+            return -EINVAL;
+        }
+
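+        /* The TCP data offset field counts 32-bit words. */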
+        l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
+        dp_packet_hwol_set_l4_len(b, l4_len);
+    }
+
     return 0;
 }
 
diff --git a/lib/netdev.c b/lib/netdev.c
index 90962ee..dbc130b 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -960,18 +960,10 @@  netdev_push_header(const struct netdev *netdev,
     size_t i, size = dp_packet_batch_size(batch);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
-                         || dp_packet_hwol_l4_mask(packet))) {
-            COVERAGE_INC(netdev_push_header_drops);
-            dp_packet_delete(packet);
-            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
-                         "not supported: packet dropped",
-                         netdev_get_name(netdev));
-        } else {
-            netdev->netdev_class->push_header(netdev, packet, data);
-            pkt_metadata_init(&packet->md, data->out_port);
-            dp_packet_batch_refill(batch, packet, i);
-        }
+        netdev->netdev_class->push_header(netdev, packet, data);
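+        /* RFC: every tunnelled packet is marked for VXLAN TSO here; a
+         * complete version would check the tunnel type and the egress
+         * device's offload capabilities first. */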
+        dp_packet_hwol_set_vxlan_tcp_seg(packet);
+        pkt_metadata_init(&packet->md, data->out_port);
+        dp_packet_batch_refill(batch, packet, i);
     }
 
     return 0;