diff mbox series

[ovs-dev,V3,3/4] Add VXLAN TCP and UDP GRO support for DPDK data path

Message ID 20200807105648.94860-4-yang_y_yi@163.com
State New
Headers show
Series userspace: enable VXLAN TSO, GSO and GRO | expand

Commit Message

yang_y_yi Aug. 7, 2020, 10:56 a.m. UTC
From: Yi Yang <yangyi01@inspur.com>

GRO(Generic Receive Offload) can help improve performance
when TSO (TCP Segment Offload) or VXLAN TSO is enabled
on transmit side, this can avoid overhead of ovs DPDK
data path and enqueue vhost for VM by merging many small
packets to large packets (65535 bytes at most) once it
receives packets from physical NIC.

It can work for both VXLAN and vlan case.

Signed-off-by: Yi Yang <yangyi01@inspur.com>
---
 lib/dp-packet.h    |  37 ++++++++-
 lib/netdev-dpdk.c  | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/netdev-linux.c | 112 ++++++++++++++++++++++++--
 3 files changed, 365 insertions(+), 11 deletions(-)
diff mbox series

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index c33868d..18307c0 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -580,7 +580,16 @@  dp_packet_set_size(struct dp_packet *b, uint32_t v)
      * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
      * loss of accuracy in assigning 'v' to 'data_len'.
      */
-    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
+    if (b->mbuf.nb_segs <= 1) {
+        b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
+    } else {
+        /* For multi-seg packet, if it is resize, data_len should be
+         * adjusted by offset, this will happend in case of push or pop.
+         */
+        if (b->mbuf.pkt_len != 0) {
+            b->mbuf.data_len += v - b->mbuf.pkt_len;
+        }
+    }
     b->mbuf.pkt_len = v;             /* Total length of all segments linked to
                                       * this segment. */
 }
@@ -1092,6 +1101,20 @@  dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
 {
     b->mbuf.l4_len = l4_len;
 }
+
+/* Set outer_l2_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int outer_l2_len)
+{
+    b->mbuf.outer_l2_len = outer_l2_len;
+}
+
+/* Set outer_l3_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int outer_l3_len)
+{
+    b->mbuf.outer_l3_len = outer_l3_len;
+}
 #else
 /* Mark packet 'b' for VXLAN TCP segmentation offloading. */
 static inline void
@@ -1125,6 +1148,18 @@  dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED,
                           int l4_len OVS_UNUSED)
 {
 }
+
+/* Set outer_l2_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int outer_l2_len)
+{
+}
+
+/* Set outer_l3_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int outer_l3_len)
+{
+}
 #endif /* DPDK_NETDEV */
 
 static inline bool
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 888a45e..b6c57a6 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -25,6 +25,7 @@ 
 #include <linux/virtio_net.h>
 #include <sys/socket.h>
 #include <linux/if.h>
+#include <linux/unistd.h>
 
 /* Include rte_compat.h first to allow experimental API's needed for the
  * rte_meter.h rfc4115 functions. Once they are no longer marked as
@@ -47,6 +48,7 @@ 
 #include <rte_version.h>
 #include <rte_vhost.h>
 #include <rte_ip_frag.h>
+#include <rte_gro.h>
 
 #include "cmap.h"
 #include "coverage.h"
@@ -2184,6 +2186,8 @@  get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype)
         }
 }
 
+#define UDP_VXLAN_ETH_HDR_SIZE 30
+
 /* Prepare the packet for HWOL.
  * Return True if the packet is OK to continue. */
 static bool
@@ -2207,6 +2211,42 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         return true;
     }
 
+    /* ol_flags is cleaned after vxlan pop, so need reset for those packets.
+     * Such packets are only for local VMs or namespaces, so need to return
+     * after ol_flags, l2_len, l3_len and tso_segsz are set.
+     */
+    if (((mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) == 0) &&
+        (mbuf->l2_len == UDP_VXLAN_ETH_HDR_SIZE) &&
+        (mbuf->pkt_len > 1464)) {
+        mbuf->ol_flags = 0;
+        mbuf->l2_len -= sizeof(struct udp_header)
+                        + sizeof(struct vxlanhdr);
+        if (mbuf->l3_len == IP_HEADER_LEN) {
+            mbuf->ol_flags |= PKT_TX_IPV4;
+            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+            l4_proto = ip_hdr->next_proto_id;
+        } else if (mbuf->l3_len == IPV6_HEADER_LEN) {
+            mbuf->ol_flags |= PKT_TX_IPV6;
+            ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+            l4_proto = ip6_hdr->proto;
+        }
+
+        mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        if (l4_proto == IPPROTO_TCP) {
+            mbuf->ol_flags |= PKT_TX_TCP_SEG;
+            mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
+        } else if (l4_proto == IPPROTO_UDP) {
+            mbuf->ol_flags |= PKT_TX_UDP_SEG;
+            mbuf->ol_flags |= PKT_TX_UDP_CKSUM;
+        }
+        mbuf->tso_segsz = 1450;
+        if (mbuf->tso_segsz > dev->mtu) {
+            mbuf->tso_segsz = dev->mtu;
+        }
+
+        return true;
+    }
+
     if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
         /* Handle VXLAN TSO */
         struct rte_udp_hdr *udp_hdr;
@@ -2853,6 +2893,104 @@  netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq)
     return dev->vhost_rxq_enabled[rxq->queue_id];
 }
 
+#define VXLAN_DST_PORT 4789
+
+static void
+netdev_dpdk_parse_hdr(struct dp_packet *pkt, int offset, uint16_t *l4_proto,
+                      int *is_frag)
+{
+    struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt;
+    struct rte_ether_hdr *eth_hdr =
+        rte_pktmbuf_mtod_offset(mbuf, struct rte_ether_hdr *, offset);
+    ovs_be16 eth_type;
+    int l2_len;
+    int l3_len = 0;
+    int l4_len = 0;
+    uint16_t inner_l4_proto = 0;
+    int inner_is_frag = 0;
+
+    if (offset == 0) {
+        *is_frag = 0;
+    }
+    mbuf->packet_type = 0;
+    l2_len = ETH_HEADER_LEN;
+    eth_type = (OVS_FORCE ovs_be16) eth_hdr->ether_type;
+    if (eth_type_vlan(eth_type)) {
+        struct rte_vlan_hdr *vlan_hdr =
+                        (struct rte_vlan_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
+
+        eth_type = (OVS_FORCE ovs_be16) vlan_hdr->eth_proto;
+        l2_len += VLAN_HEADER_LEN;
+    }
+
+    dp_packet_hwol_set_l2_len(pkt, l2_len);
+    dp_packet_hwol_set_outer_l2_len(pkt, 0);
+    dp_packet_hwol_set_outer_l3_len(pkt, 0);
+
+    if (eth_type == htons(ETH_TYPE_IP)) {
+        struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
+            ((char *)eth_hdr + l2_len);
+
+        l3_len = IP_HEADER_LEN;
+        dp_packet_hwol_set_tx_ipv4(pkt);
+        *l4_proto = ipv4_hdr->next_proto_id;
+        *is_frag = rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr);
+        mbuf->packet_type |= RTE_PTYPE_L3_IPV4;
+    } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) {
+        struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *)
+            ((char *)eth_hdr + l2_len);
+        l3_len = IPV6_HEADER_LEN;
+        dp_packet_hwol_set_tx_ipv6(pkt);
+        *l4_proto = ipv6_hdr->proto;
+    }
+
+    dp_packet_hwol_set_l3_len(pkt, l3_len);
+
+    if (*l4_proto == IPPROTO_TCP) {
+        struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *)
+            ((char *)eth_hdr + l2_len + l3_len);
+
+        l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+        dp_packet_hwol_set_l4_len(pkt, l4_len);
+        mbuf->packet_type |= RTE_PTYPE_L4_TCP;
+    } else if (*l4_proto == IPPROTO_UDP) {
+        struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *)
+            ((char *)eth_hdr + l2_len + l3_len);
+
+        l4_len = sizeof(*udp_hdr);
+        dp_packet_hwol_set_l4_len(pkt, l4_len);
+        mbuf->packet_type |= RTE_PTYPE_L4_UDP;
+
+        /* Need to parse inner packet if needed */
+        if (ntohs(udp_hdr->dst_port) == VXLAN_DST_PORT) {
+            netdev_dpdk_parse_hdr(pkt,
+                                  l2_len + l3_len + l4_len
+                                      + sizeof(struct vxlanhdr),
+                                  &inner_l4_proto,
+                                  &inner_is_frag);
+            mbuf->l2_len += sizeof(struct rte_udp_hdr)
+                                + sizeof(struct vxlanhdr);
+            dp_packet_hwol_set_outer_l2_len(pkt, l2_len);
+            dp_packet_hwol_set_outer_l3_len(pkt, l3_len);
+
+            /* Set packet_type, it is necessary for GRO */
+            mbuf->packet_type |= RTE_PTYPE_TUNNEL_VXLAN;
+            if (mbuf->l3_len == IP_HEADER_LEN) {
+                mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4;
+            }
+            if (inner_l4_proto == IPPROTO_TCP) {
+                mbuf->packet_type |= RTE_PTYPE_INNER_L4_TCP;
+                mbuf->packet_type |= RTE_PTYPE_L4_UDP;
+            } else if (inner_l4_proto == IPPROTO_UDP) {
+                mbuf->packet_type |= RTE_PTYPE_INNER_L4_UDP;
+                mbuf->packet_type |= RTE_PTYPE_L4_UDP;
+            }
+        }
+    }
+}
+
+static RTE_DEFINE_PER_LCORE(void *, _gro_ctx);
+
 static int
 netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
                      int *qfill)
@@ -2862,6 +3000,36 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
     struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev);
     int nb_rx;
     int dropped = 0;
+    struct rte_gro_param gro_param;
+    struct dp_packet *packet;
+    struct dp_packet *udp_pkts[NETDEV_MAX_BURST];
+    struct dp_packet *other_pkts[NETDEV_MAX_BURST];
+
+    int nb_udp_rx = 0;
+    int nb_other_rx = 0;
+
+    /* Initialize GRO parameters */
+    gro_param.gro_types = RTE_GRO_TCP_IPV4 |
+                          RTE_GRO_UDP_IPV4 |
+                          RTE_GRO_IPV4_VXLAN_TCP_IPV4 |
+                          RTE_GRO_IPV4_VXLAN_UDP_IPV4;
+    gro_param.max_flow_num = 1024;
+    /* There are 46 fragments for a 64K big packet */
+    gro_param.max_item_per_flow = NETDEV_MAX_BURST * 2;
+
+    /* Initialize GRO context */
+    if (RTE_PER_LCORE(_gro_ctx) == NULL) {
+        uint32_t cpu, node;
+        int ret;
+
+        ret = syscall(__NR_getcpu, &cpu, &node, NULL);
+        if (ret == 0) {
+            gro_param.socket_id = node;
+        } else {
+            gro_param.socket_id = 0;
+        }
+        RTE_PER_LCORE(_gro_ctx) = rte_gro_ctx_create(&gro_param);
+    }
 
     if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) {
         return EAGAIN;
@@ -2890,7 +3058,58 @@  netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch,
         rte_spinlock_unlock(&dev->stats_lock);
     }
 
+    /* Need to parse packet header and set necessary fields in mbuf for GRO */
     batch->count = nb_rx;
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        uint16_t l4_proto = 0;
+        int is_frag = 0;
+
+        netdev_dpdk_parse_hdr(packet, 0, &l4_proto, &is_frag);
+        if (packet->mbuf.packet_type & RTE_PTYPE_TUNNEL_VXLAN) {
+            if (packet->mbuf.packet_type & RTE_PTYPE_INNER_L4_UDP) {
+                udp_pkts[nb_udp_rx++] = packet;
+            } else {
+                other_pkts[nb_other_rx++] = packet;
+            }
+        } else {
+            if (packet->mbuf.packet_type & RTE_PTYPE_L4_UDP) {
+                udp_pkts[nb_udp_rx++] = packet;
+            } else {
+                other_pkts[nb_other_rx++] = packet;
+            }
+        }
+    }
+
+    /* Do GRO here if needed, note: IP fragment can be out of order */
+    if (nb_udp_rx) {
+        /* UDP packet must use heavy rte_gro_reassemble */
+        nb_udp_rx = rte_gro_reassemble((struct rte_mbuf **) udp_pkts,
+                                       nb_udp_rx, RTE_PER_LCORE(_gro_ctx));
+        nb_udp_rx += rte_gro_timeout_flush(RTE_PER_LCORE(_gro_ctx), 10000,
+                          RTE_GRO_UDP_IPV4
+                              | RTE_GRO_IPV4_VXLAN_UDP_IPV4,
+                          (struct rte_mbuf **)&udp_pkts[nb_udp_rx],
+                          NETDEV_MAX_BURST - nb_udp_rx);
+    }
+
+    if (nb_other_rx) {
+        /* TCP packet is better for lightweigh rte_gro_reassemble_burst */
+        nb_other_rx = rte_gro_reassemble_burst((struct rte_mbuf **) other_pkts,
+                                         nb_other_rx,
+                                         &gro_param);
+    }
+
+    batch->count = nb_udp_rx + nb_other_rx;
+    if (nb_udp_rx) {
+        memcpy(batch->packets, udp_pkts,
+               nb_udp_rx * sizeof(struct dp_packet *));
+    }
+
+    if (nb_other_rx) {
+        memcpy(&batch->packets[nb_udp_rx], other_pkts,
+               nb_other_rx * sizeof(struct dp_packet *));
+    }
+
     dp_packet_batch_init_packet_fields(batch);
 
     if (qfill) {
@@ -2931,10 +3150,11 @@  netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
     for (i = 0; i < pkt_cnt; i++) {
         pkt = pkts[i];
         if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len)
-            && !(pkt->ol_flags & PKT_TX_TCP_SEG))) {
+            && !(pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
             VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " "
-                         "max_packet_len %d", dev->up.name, pkt->pkt_len,
-                         dev->max_packet_len);
+                         "max_packet_len %d ol_flags 0x%016lx",
+                         dev->up.name, pkt->pkt_len,
+                         dev->max_packet_len, pkt->ol_flags);
             rte_pktmbuf_free(pkt);
             continue;
         }
@@ -3289,7 +3509,6 @@  netdev_dpdk_vhost_send(struct netdev *netdev, int qid,
                        struct dp_packet_batch *batch,
                        bool concurrent_txq OVS_UNUSED)
 {
-
     if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) {
         dpdk_do_tx_copy(netdev, qid, batch);
         dp_packet_delete_batch(batch, true);
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 557f139..d8a035a 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -50,6 +50,7 @@ 
 #include <unistd.h>
 
 #include "coverage.h"
+#include "csum.h"
 #include "dp-packet.h"
 #include "dpif-netlink.h"
 #include "dpif-netdev.h"
@@ -1549,8 +1550,31 @@  netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu,
         if (tso) {
             netdev_linux_prepend_vnet_hdr(packet, mtu);
         }
-
-        iov[i].iov_base = dp_packet_data(packet);
+        /* It is a GROed packet which has multiple segments, so need to merge
+         * as a big packet in order that sendmmsg can handle it correctly.
+         */
+        if (packet->mbuf.nb_segs > 1) {
+            struct dp_packet *new_packet =
+                                 dp_packet_new(dp_packet_size(packet));
+            struct rte_mbuf *next = (struct rte_mbuf *)packet;
+            uint32_t offset = 0;
+
+            iov[i].iov_base = dp_packet_data(new_packet);
+            /* Copy multi-seg mbuf data to linear buffer */
+            while (next) {
+                memcpy((uint8_t *)dp_packet_data(new_packet) + offset,
+                       rte_pktmbuf_mtod(next, char *),
+                       next->data_len);
+                offset += next->data_len;
+                next = next->next;
+            }
+            dp_packet_set_size(new_packet, offset);
+            dp_packet_delete(packet);
+            batch->packets[i] = new_packet;
+            packet = new_packet;
+        } else {
+            iov[i].iov_base = dp_packet_data(packet);
+        }
         iov[i].iov_len = dp_packet_size(packet);
         mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
                                             .msg_namelen = sizeof sll,
@@ -6624,17 +6648,93 @@  netdev_linux_parse_vnet_hdr(struct dp_packet *b)
 }
 
 static void
+netdev_linux_set_ol_flags_and_ip_cksum(struct dp_packet *b, int mtu)
+{
+    struct eth_header *eth_hdr;
+    uint16_t l4proto = 0;
+    ovs_be16 eth_type;
+    int l2_len;
+
+    eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
+    if (!eth_hdr) {
+        return;
+    }
+
+    l2_len = ETH_HEADER_LEN;
+    eth_type = eth_hdr->eth_type;
+    if (eth_type_vlan(eth_type)) {
+        struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN);
+
+        if (!vlan) {
+            return;
+        }
+
+        eth_type = vlan->vlan_next_type;
+        l2_len += VLAN_HEADER_LEN;
+    }
+
+    if (eth_type == htons(ETH_TYPE_IP)) {
+        struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
+
+        if (!ip_hdr) {
+            return;
+        }
+
+        ip_hdr->ip_csum = 0;
+        ip_hdr->ip_csum = csum(ip_hdr, sizeof *ip_hdr);
+        l4proto = ip_hdr->ip_proto;
+        dp_packet_hwol_set_tx_ipv4(b);
+    } else if (eth_type == htons(ETH_TYPE_IPV6)) {
+        struct ovs_16aligned_ip6_hdr *nh6;
+
+        nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN);
+        if (!nh6) {
+            return;
+        }
+
+        l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
+        dp_packet_hwol_set_tx_ipv6(b);
+    }
+
+    if (l4proto == IPPROTO_TCP) {
+        /* Note: needn't set tcp checksum */
+        if (dp_packet_size(b) > mtu + b->mbuf.l2_len) {
+            dp_packet_hwol_set_tcp_seg(b);
+        }
+        dp_packet_hwol_set_csum_tcp(b);
+    } else if (l4proto == IPPROTO_UDP) {
+        if (dp_packet_size(b) > mtu + b->mbuf.l2_len) {
+            dp_packet_hwol_set_udp_seg(b);
+        }
+        dp_packet_hwol_set_csum_udp(b);
+    }
+}
+
+static void
 netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu)
 {
-    struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet);
+    struct virtio_net_hdr *vnet;
+
+    /* ol_flags weren't set correctly for received packets which are from
+     * physical port and GROed, so it has to been set again in order that
+     * vnet_hdr can be prepended correctly.
+     */
+    if ((dp_packet_size(b) > mtu + b->mbuf.l2_len)
+        && !dp_packet_hwol_l4_mask(b)) {
+        netdev_linux_set_ol_flags_and_ip_cksum(b, mtu);
+    }
+
+    vnet = dp_packet_push_zeros(b, sizeof *vnet);
 
-    if (dp_packet_hwol_is_tso(b)) {
+    if (dp_packet_hwol_is_tso(b) || dp_packet_hwol_is_uso(b)) {
         uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b))
-                            + TCP_HEADER_LEN;
+                            + b->mbuf.l4_len;
 
         vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len;
         vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len);
-        if (dp_packet_hwol_is_ipv4(b)) {
+        if (dp_packet_hwol_is_uso(b)) {
+            vnet->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+        } else if (dp_packet_hwol_is_ipv4(b)) {
             vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
         } else {
             vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;