diff mbox series

[ovs-dev,V3,2/4] Add GSO support for DPDK data path

Message ID 20200807105648.94860-3-yang_y_yi@163.com
State New
Headers show
Series userspace: enable VXLAN TSO, GSO and GRO | expand

Commit Message

yang_y_yi Aug. 7, 2020, 10:56 a.m. UTC
From: Yi Yang <yangyi01@inspur.com>

GSO(Generic Segment Offload) can segment large UDP
 and TCP packet to small packets per MTU of destination
, especially for the case that physical NIC can't
do hardware offload VXLAN TSO and VXLAN UFO, GSO can
make sure userspace TSO can still work but not drop.

In addition, GSO can help improve UDP performane when
UFO is enabled in VM.

GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is
done in Tx function of physical NIC.

Signed-off-by: Yi Yang <yangyi01@inspur.com>
---
 lib/dp-packet.h    |  21 +++-
 lib/netdev-dpdk.c  | 358 +++++++++++++++++++++++++++++++++++++++++++++++++----
 lib/netdev-linux.c |  17 ++-
 lib/netdev.c       |  67 +++++++---
 4 files changed, 417 insertions(+), 46 deletions(-)

Comments

Ilya Maximets Oct. 27, 2020, 1:02 p.m. UTC | #1
On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
> From: Yi Yang <yangyi01@inspur.com>
> 
> GSO(Generic Segment Offload) can segment large UDP
>  and TCP packet to small packets per MTU of destination
> , especially for the case that physical NIC can't
> do hardware offload VXLAN TSO and VXLAN UFO, GSO can
> make sure userspace TSO can still work but not drop.
> 
> In addition, GSO can help improve UDP performane when
> UFO is enabled in VM.
> 
> GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is
> done in Tx function of physical NIC.
> 
> Signed-off-by: Yi Yang <yangyi01@inspur.com>
> ---
>  lib/dp-packet.h    |  21 +++-
>  lib/netdev-dpdk.c  | 358 +++++++++++++++++++++++++++++++++++++++++++++++++----
>  lib/netdev-linux.c |  17 ++-
>  lib/netdev.c       |  67 +++++++---
>  4 files changed, 417 insertions(+), 46 deletions(-)
> 
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 79895f2..c33868d 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -83,6 +83,8 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
>      /* VXLAN TCP Segmentation Offload. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 0x1000),
> +    /* UDP Segmentation Offload. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000),
>      /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
>  };
>  
> @@ -97,7 +99,8 @@ enum dp_packet_offload_mask {
>                                       DP_PACKET_OL_TX_IPV6          | \
>                                       DP_PACKET_OL_TX_TCP_CKSUM     | \
>                                       DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM)
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> +                                     DP_PACKET_OL_TX_UDP_SEG)
>  
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \
> @@ -956,6 +959,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b)
>      return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG);
>  }
>  
> +/* Returns 'true' if packet 'b' is marked for UDP segmentation offloading. */
> +static inline bool
> +dp_packet_hwol_is_uso(const struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG);
> +}
> +
>  /* Returns 'true' if packet 'b' is marked for IPv4 checksum offloading. */
>  static inline bool
>  dp_packet_hwol_is_ipv4(const struct dp_packet *b)
> @@ -1034,6 +1044,15 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>  }
>  
> +/* Mark packet 'b' for UDP segmentation offloading.  It implies that
> + * either the packet 'b' is marked for IPv4 or IPv6 checksum offloading
> + * and also for UDP checksum offloading. */
> +static inline void
> +dp_packet_hwol_set_udp_seg(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG;
> +}
> +
>  #ifdef DPDK_NETDEV
>  /* Mark packet 'b' for VXLAN TCP segmentation offloading. */
>  static inline void
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 30493ed..888a45e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -38,13 +38,15 @@
>  #include <rte_errno.h>
>  #include <rte_ethdev.h>
>  #include <rte_flow.h>
> +#include <rte_gso.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
>  #include <rte_pci.h>
>  #include <rte_version.h>
>  #include <rte_vhost.h>
> -#include <rte_ip.h>
> +#include <rte_ip_frag.h>
>  
>  #include "cmap.h"
>  #include "coverage.h"
> @@ -162,6 +164,7 @@ typedef uint16_t dpdk_port_t;
>                                     | DEV_TX_OFFLOAD_UDP_CKSUM    \
>                                     | DEV_TX_OFFLOAD_IPV4_CKSUM)
>  
> +#define MAX_GSO_MBUFS 64
>  
>  static const struct rte_eth_conf port_conf = {
>      .rxmode = {
> @@ -2171,6 +2174,16 @@ is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
>      return ret;
>  }
>  
> +static uint16_t
> +get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype)
> +{
> +        if (ethertype == htons(RTE_ETHER_TYPE_IPV4)) {
> +                return rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
> +        } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
> +                return rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
> +        }
> +}
> +
>  /* Prepare the packet for HWOL.
>   * Return True if the packet is OK to continue. */
>  static bool
> @@ -2203,10 +2216,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>           * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared
>           * outer_l2_len and outer_l3_len must be zeroed.
>           */
> -        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)
> -            || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
> +        if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
>              && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> -                + mbuf->l2_len)))  {
> +                + mbuf->l2_len))  {
>              mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
>              mbuf->l2_len -= sizeof(struct udp_header)
>                          + sizeof(struct vxlanhdr);
> @@ -2249,7 +2261,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>              /* inner IP checksum offload offload */
>              mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>          }
> -    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> +    } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
>          /* Handle VLAN TSO */
>              /* no inner IP checksum for IPV6 */
>          if (mbuf->ol_flags & PKT_TX_IPV4) {
> @@ -2273,6 +2285,18 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
>          mbuf->outer_l2_len = 0;
>          mbuf->outer_l3_len = 0;
> +
> +        /* In case of GRO, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set by GRO
> +         * APIs, here is a place we can mark it.
> +         */
> +        if ((mbuf->pkt_len > 1464)
> +            && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
> +            if (l4_proto == IPPROTO_UDP) {
> +                mbuf->ol_flags |= PKT_TX_UDP_SEG;
> +            } else if (l4_proto == IPPROTO_TCP) {
> +                mbuf->ol_flags |= PKT_TX_TCP_SEG;
> +            }
> +        }
>      }
>  
>      /* It is possible that l4_len isn't set for vhostuserclient */
> @@ -2284,6 +2308,10 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
>      }
>  
> +    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
> +        return true;
> +    }
> +
>      if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
>          if (l4_proto != IPPROTO_UDP) {
>              VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
> @@ -2294,11 +2322,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>                 mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
>          if (l4_proto != IPPROTO_TCP) {
>              VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +                         " pkt len: %"PRIu32" l4_proto = %d",
> +                         dev->up.name, mbuf->pkt_len, l4_proto);
>              return false;
>          }
>  
> -        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
> +        if (mbuf->pkt_len > 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> +            + mbuf->l2_len) {
>              dp_packet_hwol_set_tcp_seg(pkt);
>          }
>  
> @@ -2308,7 +2338,66 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          } else {
>              mbuf->tso_segsz = 0;
>          }
> +
> +        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> +            /* PKT_TX_TCP_CKSUM must be cleaned for GSO because
> +             * tcp checksum only can be caculated by software for
> +             * GSO case.
> +             */
> +            mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM;
> +        }
>      }
> +
> +    /* UDP GSO if necessary */
> +    if (l4_proto == IPPROTO_UDP) {
> +        /* VXLAN GSO can be done here */
> +        if ((mbuf->ol_flags & PKT_TX_UDP_SEG) ||
> +            (mbuf->pkt_len > (1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> +                             mbuf->l2_len))) {
> +            dp_packet_hwol_set_udp_seg(pkt);
> +
> +            /* For UDP GSO, udp checksum must be calculated by software */
> +            if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
> +                void *l3_hdr, *l4_hdr;
> +                struct rte_udp_hdr *udp_hdr;
> +
> +                /* PKT_TX_UDP_CKSUM must be cleaned for GSO because
> +                 * udp checksum only can be caculated by software for
> +                 * GSO case.
> +                 */
> +                mbuf->ol_flags &= ~PKT_TX_UDP_CKSUM;
> +
> +                eth_hdr = (struct rte_ether_hdr *)
> +                    ((uint8_t *)eth_hdr + mbuf->outer_l2_len +
> +                               mbuf->outer_l3_len +
> +                               sizeof(struct udp_header) +
> +                               sizeof(struct vxlanhdr));
> +                l3_hdr = (uint8_t *)eth_hdr + mbuf->l2_len -
> +                         sizeof(struct udp_header) -
> +                         sizeof(struct vxlanhdr);
> +                l4_hdr = (uint8_t *)l3_hdr + mbuf->l3_len;
> +                ip_hdr = (struct rte_ipv4_hdr *)l3_hdr;
> +                ip_hdr->hdr_checksum = 0;
> +                ip_hdr->hdr_checksum = rte_ipv4_cksum(ip_hdr);
> +                /* Don't touch UDP checksum if it is ip fragment */
> +                if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) {
> +                    udp_hdr = (struct rte_udp_hdr *)l4_hdr;
> +                    udp_hdr->dgram_cksum = 0;
> +                    udp_hdr->dgram_cksum =
> +                        get_udptcp_checksum(l3_hdr, l4_hdr,
> +                                            eth_hdr->ether_type);
> +                }
> +            }
> +
> +            /* FOR GSO, gso_size includes l2_len + l3_len */
> +            mbuf->tso_segsz = 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> +                              mbuf->l2_len;
> +            if (mbuf->tso_segsz > dev->mtu) {
> +                mbuf->tso_segsz = dev->mtu;
> +            }
> +        }
> +    }
> +
>      return true;
>  }
>  
> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>      return cnt;
>  }
>  
> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
> - * 'pkts', even in case of failure.
> - *
> - * Returns the number of packets that weren't transmitted. */
>  static inline int
> -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> -                         struct rte_mbuf **pkts, int cnt)
> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> +                           struct rte_mbuf **pkts, int cnt)
>  {
>      uint32_t nb_tx = 0;
> -    uint16_t nb_tx_prep = cnt;
> +    uint32_t nb_tx_prep;
>  
> -    if (userspace_tso_enabled()) {
> -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> -        if (nb_tx_prep != cnt) {
> -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
> -                         cnt, rte_strerror(rte_errno));
> -        }
> +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> +    if (nb_tx_prep != cnt) {
> +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> +                          "Only %u/%u are valid: %s",
> +                     dev->up.name, nb_tx_prep,
> +                     cnt, rte_strerror(rte_errno));
>      }
>  
>      while (nb_tx != nb_tx_prep) {
> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>      return cnt - nb_tx;
>  }
>  
> +static inline void
> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)

I didn't review the patch, only had a quick glance, but this part
bothers me.  OVS doesn't support multi-segment mbufs, so it should not
be possible for such mbufs being transmitted by OVS.  So, I do not
understand why this function needs to work with such mbufs.


> +{
> +    uint16_t l3_offset = mbuf->outer_l2_len + mbuf->outer_l3_len
> +                         + mbuf->l2_len;
> +    struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
> +        (rte_pktmbuf_mtod(mbuf, char *) + l3_offset);
> +    struct rte_tcp_hdr *tcp_hdr;
> +    uint32_t l4_hdr_len;
> +    uint8_t *l4_hdr;
> +    struct rte_mbuf *next = mbuf->next;
> +    uint32_t cksum = 0;
> +    uint16_t l4_proto;
> +    uint32_t inner_cksum;
> +
> +    l4_proto = ipv4_hdr->next_proto_id;
> +    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
> +        return;
> +    }
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        /* For TCP GSO, inner TCP header is in every seg,
> +         * TCP checksum has to be calculated by software.
> +         */
> +
> +        l4_hdr_len = mbuf->data_len - l3_offset
> +                     - sizeof(struct rte_ipv4_hdr);
> +        l4_hdr = (uint8_t *)(ipv4_hdr + 1);
> +        tcp_hdr = (struct rte_tcp_hdr *)l4_hdr;
> +        tcp_hdr->cksum = 0;
> +    }
> +
> +    /* Set inner ip checksum */
> +    ipv4_hdr->hdr_checksum = 0;
> +    ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        cksum = rte_raw_cksum(l4_hdr, l4_hdr_len);
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        if (next == NULL) {
> +            /* It wasn't GSOed */
> +            cksum = rte_raw_cksum(ipv4_hdr + 1,
> +                                  ntohs(ipv4_hdr->total_length)
> +                                      - sizeof(struct rte_ipv4_hdr));
> +        } else {
> +            cksum = 0;
> +        }
> +    }
> +
> +    /* It was GSOed */
> +    while (next) {
> +        cksum += rte_raw_cksum(rte_pktmbuf_mtod(next, char *), next->data_len);
> +        next = next->next;
> +    }
> +
> +    /* Save cksum to inner_cksum, outer udp checksum needs it */
> +    inner_cksum = cksum;
> +
> +    cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0);
> +    cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
> +    cksum = (~cksum) & 0xffff;
> +    if (cksum == 0) {
> +        cksum = 0xffff;
> +    }
> +
> +    /* Set inner TCP checksum */
> +    if (l4_proto == IPPROTO_TCP) {
> +        tcp_hdr->cksum = (uint16_t)cksum;
> +    }
> +
> +    /* Set outer udp checksum in case of VXLAN */
> +    if (mbuf->outer_l2_len != 0) {
> +        ipv4_hdr = (struct rte_ipv4_hdr *)
> +            (rte_pktmbuf_mtod(mbuf, char *) + mbuf->outer_l2_len);
> +        struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *)
> +            (ipv4_hdr + 1);
> +
> +        /* Set outer ip checksum */
> +        ipv4_hdr->hdr_checksum = 0;
> +        ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
> +
> +        udp_hdr->dgram_cksum = 0;
> +        cksum = rte_ipv4_phdr_cksum(ipv4_hdr, 0);
> +        cksum += rte_raw_cksum(udp_hdr, mbuf->l2_len + mbuf->l3_len);
> +        cksum += inner_cksum;
> +        if (l4_proto == IPPROTO_TCP) {
> +            cksum += tcp_hdr->cksum;
> +        }
> +        cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
> +        cksum = (~cksum) & 0xffff;
> +        if (cksum == 0) {
> +            cksum = 0xffff;
> +        }
> +        udp_hdr->dgram_cksum = (uint16_t)cksum;
> +    }
> +}
> +
> +/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
> + * 'pkts', even in case of failure.
> + *
> + * Returns the number of packets that weren't transmitted. */
> +static inline int
> +netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> +                         struct rte_mbuf **pkts, int cnt)
> +{
> +    uint32_t nb_tx = 0;
> +    int i;
> +    int ret;
> +    int failures = 0;
> +
> +    if (userspace_tso_enabled()) {
> +        /* The best point to do gso */
> +        struct rte_gso_ctx gso_ctx;
> +        struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
> +        int tx_start = -1;
> +
> +        /* Setup gso context */
> +        gso_ctx.direct_pool = dev->dpdk_mp->mp;
> +        gso_ctx.indirect_pool = dev->dpdk_mp->mp;
> +
> +        /* Do GSO if needed */
> +        for (i = 0; i < cnt; i++) {
> +            if (((pkts[i]->ol_flags & PKT_TX_UDP_SEG) &&
> +                  !(dev->hw_ol_features & DEV_TX_OFFLOAD_UDP_TSO)) ||
> +                ((pkts[i]->ol_flags & PKT_TX_TCP_SEG) &&
> +                  ((!(dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD)
> +                   && (pkts[i]->ol_flags & PKT_TX_TUNNEL_VXLAN))
> +                   || !(dev->hw_ol_features & DEV_TX_OFFLOAD_TCP_TSO)))) {
> +                /* Send non GSO packets before pkts[i] */
> +                if (tx_start != -1) {
> +                    failures += __netdev_dpdk_eth_tx_burst(
> +                                    dev, qid,
> +                                    pkts + tx_start,
> +                                    i - tx_start);
> +                }
> +                tx_start = -1;
> +
> +                gso_ctx.gso_types = 0;
> +                gso_ctx.gso_size = pkts[i]->tso_segsz;
> +                gso_ctx.flag = 0;
> +                if (pkts[i]->ol_flags & PKT_TX_TUNNEL_VXLAN) {
> +                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
> +                }
> +                if (pkts[i]->ol_flags & PKT_TX_UDP_SEG) {
> +                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_UDP_TSO;
> +                } else if (pkts[i]->ol_flags & PKT_TX_TCP_SEG) {
> +                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_TCP_TSO;
> +                    pkts[i]->ol_flags &= ~PKT_TX_TCP_CKSUM;
> +                }
> +                ret = rte_gso_segment(pkts[i], /* packet to segment */
> +                                      &gso_ctx, /* gso context */
> +                                      /* gso output mbufs */
> +                                      (struct rte_mbuf **)&gso_mbufs,
> +                                      MAX_GSO_MBUFS);
> +                if (ret < 0) {
> +                    rte_pktmbuf_free(pkts[i]);
> +                } else {
> +                    int j, k;
> +                    struct rte_mbuf * next_part;
> +                    nb_tx = ret;
> +                    for (j = 0; j < nb_tx; j++) {
> +                        set_multiseg_udptcp_cksum(gso_mbufs[j]);
> +                        /* Clear them because of no offload */
> +                        gso_mbufs[j]->ol_flags = 0;
> +                        gso_mbufs[j]->outer_l2_len = 0;
> +                        gso_mbufs[j]->outer_l3_len = 0;
> +                        gso_mbufs[j]->l2_len = 0;
> +                        gso_mbufs[j]->l3_len = 0;
> +                        gso_mbufs[j]->l4_len = 0;
> +                        next_part = gso_mbufs[j];
> +                        for (k = 0; k < gso_mbufs[j]->nb_segs; k++) {
> +                            next_part = next_part->next;
> +                        }
> +                    }
> +                    __netdev_dpdk_eth_tx_burst(dev, qid, gso_mbufs, nb_tx);
> +                }
> +                continue;
> +            }
> +            if (tx_start == -1) {
> +                tx_start = i;
> +            }
> +        }
> +
> +        if (tx_start != -1) {
> +            /* Send non GSO packets before pkts[i] */
> +            failures += __netdev_dpdk_eth_tx_burst(dev, qid, pkts + tx_start,
> +                                       i - tx_start);
> +        }
> +        return failures;
> +    }
> +
> +    return __netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt);
> +}
> +
>  static inline bool
>  netdev_dpdk_srtcm_policer_pkt_handle(struct rte_meter_srtcm *meter,
>                                       struct rte_meter_srtcm_profile *profile,
> @@ -2786,10 +3064,24 @@ out:
>      }
>  }
>  
> +struct shinfo_arg {
> +    void * buf;
> +    struct rte_mbuf *mbuf;
> +};
> +
> +/* For GSO case, the extended mbuf only can be freed by
> + * netdev_dpdk_extbuf_free
> + */
>  static void
> -netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
> +netdev_dpdk_extbuf_free(struct rte_mbuf *m, void *opaque)
>  {
> -    rte_free(opaque);
> +    struct shinfo_arg *arg = (struct shinfo_arg *)opaque;
> +
> +    rte_free(arg->buf);
> +    if (m != arg->mbuf) {
> +        rte_pktmbuf_free(arg->mbuf);
> +    }
> +    free(arg);
>  }
>  
>  static struct rte_mbuf *
> @@ -2821,8 +3113,11 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
>  
>      /* Initialize shinfo. */
>      if (shinfo) {
> +        struct shinfo_arg *arg = xmalloc(sizeof(struct shinfo_arg));
> +        arg->buf = buf;
> +        arg->mbuf = pkt;
>          shinfo->free_cb = netdev_dpdk_extbuf_free;
> -        shinfo->fcb_opaque = buf;
> +        shinfo->fcb_opaque = arg;
>          rte_mbuf_ext_refcnt_set(shinfo, 1);
>      } else {
>          shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
> @@ -2852,6 +3147,10 @@ dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
>          return NULL;
>      }
>  
> +    if (unlikely(pkt->shinfo != NULL)) {
> +        pkt->shinfo = NULL;
> +    }
> +
>      if (rte_pktmbuf_tailroom(pkt) >= data_len) {
>          return pkt;
>      }
> @@ -5192,6 +5491,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
>      int err;
>      uint64_t vhost_flags = 0;
>      uint64_t vhost_unsup_flags;
> +    uint64_t vhost_supported_flags;
>      bool zc_enabled;
>  
>      ovs_mutex_lock(&dev->mutex);
> @@ -5277,6 +5577,16 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
>              goto unlock;
>          }
>  
> +        err = rte_vhost_driver_get_features(dev->vhost_id,
> +                                            &vhost_supported_flags);
> +        if (err) {
> +            VLOG_ERR("rte_vhost_driver_get_features failed for "
> +                     "vhost user client port: %s\n", dev->up.name);
> +            goto unlock;
> +        }
> +        VLOG_INFO("vhostuserclient port %s features: 0x%016lx",
> +                  dev->up.name, vhost_supported_flags);
> +
>          err = rte_vhost_driver_start(dev->vhost_id);
>          if (err) {
>              VLOG_ERR("rte_vhost_driver_start failed for vhost user "
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 9f830b4..557f139 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -6566,6 +6566,16 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>  
>          l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
>          dp_packet_hwol_set_l4_len(b, l4_len);
> +    } else if (*l4proto == IPPROTO_UDP) {
> +        struct udp_header *udp_hdr =  dp_packet_at(b, l2_len + l3_len,
> +                                          sizeof(struct udp_header));
> +
> +        if (!udp_hdr) {
> +            return -EINVAL;
> +        }
> +
> +        l4_len = sizeof(struct udp_header);
> +        dp_packet_hwol_set_l4_len(b, l4_len);
>      }
>  
>      return 0;
> @@ -6581,10 +6591,6 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
>          return -EINVAL;
>      }
>  
> -    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
> -        return 0;
> -    }
> -
>      if (netdev_linux_parse_l2(b, &l4proto)) {
>          return -EINVAL;
>      }
> @@ -6609,6 +6615,9 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
>              || type == VIRTIO_NET_HDR_GSO_TCPV6) {
>              dp_packet_hwol_set_tcp_seg(b);
>          }
> +        if (type == VIRTIO_NET_HDR_GSO_UDP) {
> +            dp_packet_hwol_set_udp_seg(b);
> +        }
>      }
>  
>      return 0;
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 64583d1..02f28c8 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -825,23 +825,41 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
>  {
>      uint64_t l4_mask;
>  
> -    if (dp_packet_hwol_is_vxlan_tcp_seg(packet)
> -        && (dp_packet_hwol_is_tso(packet) || dp_packet_hwol_l4_mask(packet))
> -        && !(netdev_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)) {
> -        /* Fall back to GSO in software. */
> -        VLOG_ERR_BUF(errormsg, "No VXLAN TSO support");
> -        return false;
> -    }
> -
> -    if (dp_packet_hwol_is_tso(packet)
> -        && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> -            /* Fall back to GSO in software. */
> -            VLOG_ERR_BUF(errormsg, "No TSO support");
> -            return false;
> -    }
> -
> +    /* GSO can handle TSO by software even if device can't handle hardware
> +     * offload, so needn't check it here.
> +     */
>      l4_mask = dp_packet_hwol_l4_mask(packet);
>      if (l4_mask) {
> +        /* Calculate checksum for VLAN TSO case when no hardware offload
> +         * feature is available. Note: for VXLAN TSO case, checksum has
> +         * been calculated before here, so it won't be done here again
> +         * because checksum flags in packet->m.ol_flags have been cleaned.
> +         */
> +        if (dp_packet_hwol_l4_is_tcp(packet)
> +            && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
> +            && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
> +            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_TCP_CKSUM;
> +            /* Only calculate TCP checksum for non-TSO packet,
> +             * it will be calculated after GSO for TSO packet.
> +             */
> +            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_TCP_SEG)) {
> +                calculate_tcpudp_checksum(packet);
> +            }
> +            return true;
> +        } else if (dp_packet_hwol_l4_is_udp(packet)
> +            && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
> +            && !(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
> +            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
> +            /* Only calculate UDP checksum for non-UFO packet,
> +             * it will be calculated immediately before GSO for
> +             * UFO packet.
> +             */
> +            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_UDP_SEG)) {
> +                calculate_tcpudp_checksum(packet);
> +            }
> +            return true;
> +        }
> +
>          if (dp_packet_hwol_l4_is_tcp(packet)) {
>              if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
>                  /* Fall back to TCP csum in software. */
> @@ -1013,11 +1031,26 @@ netdev_push_header(const struct netdev *netdev,
>                  /* VXLAN offload can't support udp checksum offload
>                   * for inner udp packet, so udp checksum must be set
>                   * before push header in order that outer checksum can
> -                 * be set correctly.
> +                 * be set correctly. But GSO code will set udp checksum
> +                 * if packet->mbuf.ol_flags has DP_PACKET_OL_TX_UDP_SEG.
>                   */
>                  if (dp_packet_hwol_l4_is_udp(packet)) {
>                      packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
> -                    calculate_tcpudp_checksum(packet);
> +                    /* Only calculate UDP checksum for non-UFO packet,
> +                     * it will be calculated immediately before GSO for
> +                     * UFO packet.
> +                     */
> +                    if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_UDP_SEG)) {
> +                        calculate_tcpudp_checksum(packet);
> +                    }
> +                } else if (dp_packet_hwol_l4_is_tcp(packet)) {
> +                    packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_TCP_CKSUM;
> +                    /* Only calculate TCP checksum for non-TSO packet,
> +                     * it will be calculated after GSO for TSO packet.
> +                     */
> +                    if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_TCP_SEG)) {
> +                        calculate_tcpudp_checksum(packet);
> +                    }
>                  }
>              }
>              netdev->netdev_class->push_header(netdev, packet, data);
>
Yi Yang (杨燚)-云服务集团 Oct. 28, 2020, 12:56 a.m. UTC | #2
-----邮件原件-----
发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets
发送时间: 2020年10月27日 21:03
收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org
抄送: fbl@sysclose.org; i.maximets@ovn.org
主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path

On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
> From: Yi Yang <yangyi01@inspur.com>
> 
> GSO(Generic Segment Offload) can segment large UDP  and TCP packet to 
> small packets per MTU of destination , especially for the case that 
> physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO, GSO 
> can make sure userspace TSO can still work but not drop.
> 
> In addition, GSO can help improve UDP performane when UFO is enabled 
> in VM.
> 
> GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx 
> function of physical NIC.
> 
> Signed-off-by: Yi Yang <yangyi01@inspur.com>
> ---
>  lib/dp-packet.h    |  21 +++-
>  lib/netdev-dpdk.c  | 358 
> +++++++++++++++++++++++++++++++++++++++++++++++++----
>  lib/netdev-linux.c |  17 ++-
>  lib/netdev.c       |  67 +++++++---
>  4 files changed, 417 insertions(+), 46 deletions(-)
> 
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 79895f2..c33868d 
> 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -83,6 +83,8 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
>      /* VXLAN TCP Segmentation Offload. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 
> 0x1000),
> +    /* UDP Segmentation Offload. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000),
>      /* Adding new field requires adding to 
> DP_PACKET_OL_SUPPORTED_MASK. */  };
>  
> @@ -97,7 +99,8 @@ enum dp_packet_offload_mask {
>                                       DP_PACKET_OL_TX_IPV6          | \
>                                       DP_PACKET_OL_TX_TCP_CKSUM     | \
>                                       DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM)
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> +                                     DP_PACKET_OL_TX_UDP_SEG)
>  
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \ @@ 
> -956,6 +959,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b)
>      return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG);  
> }
>  
> +/* Returns 'true' if packet 'b' is marked for UDP segmentation 
> +offloading. */ static inline bool dp_packet_hwol_is_uso(const struct 
> +dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG); 
> +}
> +
>  /* Returns 'true' if packet 'b' is marked for IPv4 checksum 
> offloading. */  static inline bool  dp_packet_hwol_is_ipv4(const 
> struct dp_packet *b) @@ -1034,6 +1044,15 @@ 
> dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;  }
>  
> +/* Mark packet 'b' for UDP segmentation offloading.  It implies that
> + * either the packet 'b' is marked for IPv4 or IPv6 checksum 
> +offloading
> + * and also for UDP checksum offloading. */ static inline void 
> +dp_packet_hwol_set_udp_seg(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG; }
> +
>  #ifdef DPDK_NETDEV
>  /* Mark packet 'b' for VXLAN TCP segmentation offloading. */  static 
> inline void diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 
> 30493ed..888a45e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -38,13 +38,15 @@
>  #include <rte_errno.h>
>  #include <rte_ethdev.h>
>  #include <rte_flow.h>
> +#include <rte_gso.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
>  #include <rte_pci.h>
>  #include <rte_version.h>
>  #include <rte_vhost.h>
> -#include <rte_ip.h>
> +#include <rte_ip_frag.h>
>  
>  #include "cmap.h"
>  #include "coverage.h"
> @@ -162,6 +164,7 @@ typedef uint16_t dpdk_port_t;
>                                     | DEV_TX_OFFLOAD_UDP_CKSUM    \
>                                     | DEV_TX_OFFLOAD_IPV4_CKSUM)
>  
> +#define MAX_GSO_MBUFS 64
>  
>  static const struct rte_eth_conf port_conf = {
>      .rxmode = {
> @@ -2171,6 +2174,16 @@ is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
>      return ret;
>  }
>  
> +static uint16_t
> +get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype) {
> +        if (ethertype == htons(RTE_ETHER_TYPE_IPV4)) {
> +                return rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
> +        } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
> +                return rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
> +        }
> +}
> +
>  /* Prepare the packet for HWOL.
>   * Return True if the packet is OK to continue. */  static bool @@ 
> -2203,10 +2216,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>           * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared
>           * outer_l2_len and outer_l3_len must be zeroed.
>           */
> -        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)
> -            || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
> +        if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
>              && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> -                + mbuf->l2_len)))  {
> +                + mbuf->l2_len))  {
>              mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
>              mbuf->l2_len -= sizeof(struct udp_header)
>                          + sizeof(struct vxlanhdr); @@ -2249,7 +2261,7 
> @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>              /* inner IP checksum offload offload */
>              mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>          }
> -    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> +    } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
>          /* Handle VLAN TSO */
>              /* no inner IP checksum for IPV6 */
>          if (mbuf->ol_flags & PKT_TX_IPV4) { @@ -2273,6 +2285,18 @@ 
> netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
>          mbuf->outer_l2_len = 0;
>          mbuf->outer_l3_len = 0;
> +
> +        /* In case of GRO, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set by GRO
> +         * APIs, here is a place we can mark it.
> +         */
> +        if ((mbuf->pkt_len > 1464)
> +            && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
> +            if (l4_proto == IPPROTO_UDP) {
> +                mbuf->ol_flags |= PKT_TX_UDP_SEG;
> +            } else if (l4_proto == IPPROTO_TCP) {
> +                mbuf->ol_flags |= PKT_TX_TCP_SEG;
> +            }
> +        }
>      }
>  
>      /* It is possible that l4_len isn't set for vhostuserclient */ @@ 
> -2284,6 +2308,10 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
>      }
>  
> +    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
> +        return true;
> +    }
> +
>      if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
>          if (l4_proto != IPPROTO_UDP) {
>              VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
> @@ -2294,11 +2322,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>                 mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
>          if (l4_proto != IPPROTO_TCP) {
>              VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +                         " pkt len: %"PRIu32" l4_proto = %d",
> +                         dev->up.name, mbuf->pkt_len, l4_proto);
>              return false;
>          }
>  
> -        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
> +        if (mbuf->pkt_len > 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> +            + mbuf->l2_len) {
>              dp_packet_hwol_set_tcp_seg(pkt);
>          }
>  
> @@ -2308,7 +2338,66 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          } else {
>              mbuf->tso_segsz = 0;
>          }
> +
> +        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> +            /* PKT_TX_TCP_CKSUM must be cleaned for GSO because
> +             * tcp checksum only can be caculated by software for
> +             * GSO case.
> +             */
> +            mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM;
> +        }
>      }
> +
> +    /* UDP GSO if necessary */
> +    if (l4_proto == IPPROTO_UDP) {
> +        /* VXLAN GSO can be done here */
> +        if ((mbuf->ol_flags & PKT_TX_UDP_SEG) ||
> +            (mbuf->pkt_len > (1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> +                             mbuf->l2_len))) {
> +            dp_packet_hwol_set_udp_seg(pkt);
> +
> +            /* For UDP GSO, udp checksum must be calculated by software */
> +            if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
> +                void *l3_hdr, *l4_hdr;
> +                struct rte_udp_hdr *udp_hdr;
> +
> +                /* PKT_TX_UDP_CKSUM must be cleaned for GSO because
> +                 * udp checksum only can be caculated by software for
> +                 * GSO case.
> +                 */
> +                mbuf->ol_flags &= ~PKT_TX_UDP_CKSUM;
> +
> +                eth_hdr = (struct rte_ether_hdr *)
> +                    ((uint8_t *)eth_hdr + mbuf->outer_l2_len +
> +                               mbuf->outer_l3_len +
> +                               sizeof(struct udp_header) +
> +                               sizeof(struct vxlanhdr));
> +                l3_hdr = (uint8_t *)eth_hdr + mbuf->l2_len -
> +                         sizeof(struct udp_header) -
> +                         sizeof(struct vxlanhdr);
> +                l4_hdr = (uint8_t *)l3_hdr + mbuf->l3_len;
> +                ip_hdr = (struct rte_ipv4_hdr *)l3_hdr;
> +                ip_hdr->hdr_checksum = 0;
> +                ip_hdr->hdr_checksum = rte_ipv4_cksum(ip_hdr);
> +                /* Don't touch UDP checksum if it is ip fragment */
> +                if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) {
> +                    udp_hdr = (struct rte_udp_hdr *)l4_hdr;
> +                    udp_hdr->dgram_cksum = 0;
> +                    udp_hdr->dgram_cksum =
> +                        get_udptcp_checksum(l3_hdr, l4_hdr,
> +                                            eth_hdr->ether_type);
> +                }
> +            }
> +
> +            /* FOR GSO, gso_size includes l2_len + l3_len */
> +            mbuf->tso_segsz = 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> +                              mbuf->l2_len;
> +            if (mbuf->tso_segsz > dev->mtu) {
> +                mbuf->tso_segsz = dev->mtu;
> +            }
> +        }
> +    }
> +
>      return true;
>  }
>  
> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>      return cnt;
>  }
>  
> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes 
> ownership of
> - * 'pkts', even in case of failure.
> - *
> - * Returns the number of packets that weren't transmitted. */  static 
> inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> -                         struct rte_mbuf **pkts, int cnt)
> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> +                           struct rte_mbuf **pkts, int cnt)
>  {
>      uint32_t nb_tx = 0;
> -    uint16_t nb_tx_prep = cnt;
> +    uint32_t nb_tx_prep;
>  
> -    if (userspace_tso_enabled()) {
> -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> -        if (nb_tx_prep != cnt) {
> -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
> -                         cnt, rte_strerror(rte_errno));
> -        }
> +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> +    if (nb_tx_prep != cnt) {
> +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> +                          "Only %u/%u are valid: %s",
> +                     dev->up.name, nb_tx_prep,
> +                     cnt, rte_strerror(rte_errno));
>      }
>  
>      while (nb_tx != nb_tx_prep) {
> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>      return cnt - nb_tx;
>  }
>  
> +static inline void
> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)

I didn't review the patch, only had a quick glance, but this part bothers me.  OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS.  So, I do not understand why this function needs to work with such mbufs.

[Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf.
William Tu Feb. 7, 2021, 3:45 p.m. UTC | #3
On Tue, Oct 27, 2020 at 6:02 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote:
>
> -----邮件原件-----
> 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets
> 发送时间: 2020年10月27日 21:03
> 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org
> 抄送: fbl@sysclose.org; i.maximets@ovn.org
> 主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path
>
> On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
> > From: Yi Yang <yangyi01@inspur.com>
> >
> > GSO(Generic Segment Offload) can segment large UDP  and TCP packet to
> > small packets per MTU of destination , especially for the case that
> > physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO, GSO
> > can make sure userspace TSO can still work but not drop.
> >
> > In addition, GSO can help improve UDP performane when UFO is enabled
> > in VM.
> >
> > GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx
> > function of physical NIC.
> >
> > Signed-off-by: Yi Yang <yangyi01@inspur.com>
> > ---
> >  lib/dp-packet.h    |  21 +++-
> >  lib/netdev-dpdk.c  | 358
> > +++++++++++++++++++++++++++++++++++++++++++++++++----
> >  lib/netdev-linux.c |  17 ++-
> >  lib/netdev.c       |  67 +++++++---
> >  4 files changed, 417 insertions(+), 46 deletions(-)

snip

> >
> > @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
> >      return cnt;
> >  }
> >
> > -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
> > ownership of
> > - * 'pkts', even in case of failure.
> > - *
> > - * Returns the number of packets that weren't transmitted. */  static
> > inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> > -                         struct rte_mbuf **pkts, int cnt)
> > +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> > +                           struct rte_mbuf **pkts, int cnt)
> >  {
> >      uint32_t nb_tx = 0;
> > -    uint16_t nb_tx_prep = cnt;
> > +    uint32_t nb_tx_prep;
> >
> > -    if (userspace_tso_enabled()) {
> > -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> > -        if (nb_tx_prep != cnt) {
> > -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> > -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
> > -                         cnt, rte_strerror(rte_errno));
> > -        }
> > +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> > +    if (nb_tx_prep != cnt) {
> > +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> > +                          "Only %u/%u are valid: %s",
> > +                     dev->up.name, nb_tx_prep,
> > +                     cnt, rte_strerror(rte_errno));
> >      }
> >
> >      while (nb_tx != nb_tx_prep) {
> > @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> >      return cnt - nb_tx;
> >  }
> >
> > +static inline void
> > +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
>
> I didn't review the patch, only had a quick glance, but this part bothers me.  OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS.  So, I do not understand why this function needs to work with such mbufs.
>
> [Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf.
>

Hi Ilya,

Now I understand Yi Yang's point better and I agree with him.
Looks like the patch does the GSO at the DPDK TX function.
It creates multi-seg mbuf after rte_gso_segment(), but will immediately
send out the multi-seg mbuf to DPDK port, without traversing inside other
part of OVS code. I guess this case it should work OK?

William
Yi Yang (杨燚)-云服务集团 Feb. 8, 2021, 12:56 a.m. UTC | #4
Yes, GSO  is ok, but GRO may be have that issue, I didn't see that issue in my openstack environment, so maybe it will be great if we can have a test case to trigger that issue.

-----邮件原件-----
发件人: William Tu [mailto:u9012063@gmail.com] 
发送时间: 2021年2月7日 23:46
收件人: Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com>
抄送: i.maximets@ovn.org; yang_y_yi@163.com; ovs-dev@openvswitch.org; fbl@sysclose.org
主题: Re: [ovs-dev] 答复: [PATCH V3 2/4] Add GSO support for DPDK data path

On Tue, Oct 27, 2020 at 6:02 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote:
>
> -----邮件原件-----
> 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets
> 发送时间: 2020年10月27日 21:03
> 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org
> 抄送: fbl@sysclose.org; i.maximets@ovn.org
> 主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path
>
> On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
> > From: Yi Yang <yangyi01@inspur.com>
> >
> > GSO(Generic Segment Offload) can segment large UDP  and TCP packet 
> > to small packets per MTU of destination , especially for the case 
> > that physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO, 
> > GSO can make sure userspace TSO can still work but not drop.
> >
> > In addition, GSO can help improve UDP performane when UFO is enabled 
> > in VM.
> >
> > GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx 
> > function of physical NIC.
> >
> > Signed-off-by: Yi Yang <yangyi01@inspur.com>
> > ---
> >  lib/dp-packet.h    |  21 +++-
> >  lib/netdev-dpdk.c  | 358
> > +++++++++++++++++++++++++++++++++++++++++++++++++----
> >  lib/netdev-linux.c |  17 ++-
> >  lib/netdev.c       |  67 +++++++---
> >  4 files changed, 417 insertions(+), 46 deletions(-)

snip

> >
> > @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
> >      return cnt;
> >  }
> >
> > -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes 
> > ownership of
> > - * 'pkts', even in case of failure.
> > - *
> > - * Returns the number of packets that weren't transmitted. */  
> > static inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> > -                         struct rte_mbuf **pkts, int cnt)
> > +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> > +                           struct rte_mbuf **pkts, int cnt)
> >  {
> >      uint32_t nb_tx = 0;
> > -    uint16_t nb_tx_prep = cnt;
> > +    uint32_t nb_tx_prep;
> >
> > -    if (userspace_tso_enabled()) {
> > -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> > -        if (nb_tx_prep != cnt) {
> > -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> > -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
> > -                         cnt, rte_strerror(rte_errno));
> > -        }
> > +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> > +    if (nb_tx_prep != cnt) {
> > +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> > +                          "Only %u/%u are valid: %s",
> > +                     dev->up.name, nb_tx_prep,
> > +                     cnt, rte_strerror(rte_errno));
> >      }
> >
> >      while (nb_tx != nb_tx_prep) {
> > @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> >      return cnt - nb_tx;
> >  }
> >
> > +static inline void
> > +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
>
> I didn't review the patch, only had a quick glance, but this part bothers me.  OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS.  So, I do not understand why this function needs to work with such mbufs.
>
> [Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf.
>

Hi Ilya,

Now I understand Yi Yang's point better and I agree with him.
Looks like the patch does the GSO at the DPDK TX function.
It creates multi-seg mbuf after rte_gso_segment(), but will immediately send out the multi-seg mbuf to DPDK port, without traversing inside other part of OVS code. I guess this case it should work OK?

William
Ilya Maximets Feb. 8, 2021, 2:36 p.m. UTC | #5
On 2/8/21 1:56 AM, Yi Yang (杨燚)-云服务集团 wrote:
> Yes, GSO  is ok, but GRO may be have that issue, I didn't see that issue in my openstack environment, so maybe it will be great if we can have a test case to trigger that issue.
> 
> -----邮件原件-----
> 发件人: William Tu [mailto:u9012063@gmail.com] 
> 发送时间: 2021年2月7日 23:46
> 收件人: Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com>
> 抄送: i.maximets@ovn.org; yang_y_yi@163.com; ovs-dev@openvswitch.org; fbl@sysclose.org
> 主题: Re: [ovs-dev] 答复: [PATCH V3 2/4] Add GSO support for DPDK data path
> 
> On Tue, Oct 27, 2020 at 6:02 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote:
>>
>> -----邮件原件-----
>> 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets
>> 发送时间: 2020年10月27日 21:03
>> 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org
>> 抄送: fbl@sysclose.org; i.maximets@ovn.org
>> 主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path
>>
>> On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
>>> From: Yi Yang <yangyi01@inspur.com>
>>>
>>> GSO(Generic Segment Offload) can segment large UDP  and TCP packet 
>>> to small packets per MTU of destination , especially for the case 
>>> that physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO, 
>>> GSO can make sure userspace TSO can still work but not drop.
>>>
>>> In addition, GSO can help improve UDP performane when UFO is enabled 
>>> in VM.
>>>
>>> GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx 
>>> function of physical NIC.
>>>
>>> Signed-off-by: Yi Yang <yangyi01@inspur.com>
>>> ---
>>>  lib/dp-packet.h    |  21 +++-
>>>  lib/netdev-dpdk.c  | 358
>>> +++++++++++++++++++++++++++++++++++++++++++++++++----
>>>  lib/netdev-linux.c |  17 ++-
>>>  lib/netdev.c       |  67 +++++++---
>>>  4 files changed, 417 insertions(+), 46 deletions(-)
> 
> snip
> 
>>>
>>> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>>>      return cnt;
>>>  }
>>>
>>> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes 
>>> ownership of
>>> - * 'pkts', even in case of failure.
>>> - *
>>> - * Returns the number of packets that weren't transmitted. */  
>>> static inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>>> -                         struct rte_mbuf **pkts, int cnt)
>>> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>>> +                           struct rte_mbuf **pkts, int cnt)
>>>  {
>>>      uint32_t nb_tx = 0;
>>> -    uint16_t nb_tx_prep = cnt;
>>> +    uint32_t nb_tx_prep;
>>>
>>> -    if (userspace_tso_enabled()) {
>>> -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
>>> -        if (nb_tx_prep != cnt) {
>>> -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
>>> -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
>>> -                         cnt, rte_strerror(rte_errno));
>>> -        }
>>> +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
>>> +    if (nb_tx_prep != cnt) {
>>> +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
>>> +                          "Only %u/%u are valid: %s",
>>> +                     dev->up.name, nb_tx_prep,
>>> +                     cnt, rte_strerror(rte_errno));
>>>      }
>>>
>>>      while (nb_tx != nb_tx_prep) {
>>> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>>>      return cnt - nb_tx;
>>>  }
>>>
>>> +static inline void
>>> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
>>
>> I didn't review the patch, only had a quick glance, but this part bothers me.  OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS.  So, I do not understand why this function needs to work with such mbufs.
>>
>> [Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf.
>>
> 
> Hi Ilya,
> 
> Now I understand Yi Yang's point better and I agree with him.
> Looks like the patch does the GSO at the DPDK TX function.
> It creates multi-seg mbuf after rte_gso_segment(), but will immediately send out the multi-seg mbuf to DPDK port, without traversing inside other part of OVS code. I guess this case it should work OK?

Hi.  GSO itself should be possible to implement, but we should
not enable support for multi-segment mbufs in the NIC, since this
might end up in multi-segment packets being received by OVS
causing lots of trouble (for example dp_packet_clone() uses simple
memcpy() that will result in invalid memory reads.)

GSO should be implemented in a same way TSO implemented.  Right now
TSO implementation has no software fallback to segment large packets
if netdev doesn't support - it it simply drops them.  However,
software fallback should be implemented someday and it should be
implemented in a generic way for all the netdev types instead of
doing it only for one (netdev-dpdk).  Brief look at the patch tells
me that the generic check from netdev_send_prepare_packet() was
removed.  This means that huge packets could reach e.g. netdev-afxdp
that will not behave correctly.
Generic software fallback should produce sequence of 'dp_packet's
instead of multi-segment mbufs, so there should be no need to enable
support for multi-segment mbufs in DPDK PMD.

Best regards, Ilya Maximets.
William Tu Feb. 8, 2021, 7:54 p.m. UTC | #6
On Mon, Feb 8, 2021 at 6:36 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 2/8/21 1:56 AM, Yi Yang (杨燚)-云服务集团 wrote:
> > Yes, GSO  is ok, but GRO may be have that issue, I didn't see that issue in my openstack environment, so maybe it will be great if we can have a test case to trigger that issue.
> >
> > -----邮件原件-----
> > 发件人: William Tu [mailto:u9012063@gmail.com]
> > 发送时间: 2021年2月7日 23:46
> > 收件人: Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com>
> > 抄送: i.maximets@ovn.org; yang_y_yi@163.com; ovs-dev@openvswitch.org; fbl@sysclose.org
> > 主题: Re: [ovs-dev] 答复: [PATCH V3 2/4] Add GSO support for DPDK data path
> >
> > On Tue, Oct 27, 2020 at 6:02 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote:
> >>
> >> -----邮件原件-----
> >> 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets
> >> 发送时间: 2020年10月27日 21:03
> >> 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org
> >> 抄送: fbl@sysclose.org; i.maximets@ovn.org
> >> 主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path
> >>
> >> On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
> >>> From: Yi Yang <yangyi01@inspur.com>
> >>>
> >>> GSO(Generic Segment Offload) can segment large UDP  and TCP packet
> >>> to small packets per MTU of destination , especially for the case
> >>> that physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO,
> >>> GSO can make sure userspace TSO can still work but not drop.
> >>>
> >>> In addition, GSO can help improve UDP performane when UFO is enabled
> >>> in VM.
> >>>
> >>> GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx
> >>> function of physical NIC.
> >>>
> >>> Signed-off-by: Yi Yang <yangyi01@inspur.com>
> >>> ---
> >>>  lib/dp-packet.h    |  21 +++-
> >>>  lib/netdev-dpdk.c  | 358
> >>> +++++++++++++++++++++++++++++++++++++++++++++++++----
> >>>  lib/netdev-linux.c |  17 ++-
> >>>  lib/netdev.c       |  67 +++++++---
> >>>  4 files changed, 417 insertions(+), 46 deletions(-)
> >
> > snip
> >
> >>>
> >>> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
> >>>      return cnt;
> >>>  }
> >>>
> >>> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
> >>> ownership of
> >>> - * 'pkts', even in case of failure.
> >>> - *
> >>> - * Returns the number of packets that weren't transmitted. */
> >>> static inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> >>> -                         struct rte_mbuf **pkts, int cnt)
> >>> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> >>> +                           struct rte_mbuf **pkts, int cnt)
> >>>  {
> >>>      uint32_t nb_tx = 0;
> >>> -    uint16_t nb_tx_prep = cnt;
> >>> +    uint32_t nb_tx_prep;
> >>>
> >>> -    if (userspace_tso_enabled()) {
> >>> -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> >>> -        if (nb_tx_prep != cnt) {
> >>> -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> >>> -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
> >>> -                         cnt, rte_strerror(rte_errno));
> >>> -        }
> >>> +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> >>> +    if (nb_tx_prep != cnt) {
> >>> +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> >>> +                          "Only %u/%u are valid: %s",
> >>> +                     dev->up.name, nb_tx_prep,
> >>> +                     cnt, rte_strerror(rte_errno));
> >>>      }
> >>>
> >>>      while (nb_tx != nb_tx_prep) {
> >>> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> >>>      return cnt - nb_tx;
> >>>  }
> >>>
> >>> +static inline void
> >>> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
> >>
> >> I didn't review the patch, only had a quick glance, but this part bothers me.  OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS.  So, I do not understand why this function needs to work with such mbufs.
> >>
> >> [Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf.
> >>
> >
> > Hi Ilya,
> >
> > Now I understand Yi Yang's point better and I agree with him.
> > Looks like the patch does the GSO at the DPDK TX function.
> > It creates multi-seg mbuf after rte_gso_segment(), but will immediately send out the multi-seg mbuf to DPDK port, without traversing inside other part of OVS code. I guess this case it should work OK?
>
> Hi.  GSO itself should be possible to implement, but we should
> not enable support for multi-segment mbufs in the NIC, since this
> might end up in multi-segment packets being received by OVS
> causing lots of trouble (for example dp_packet_clone() uses simple
> memcpy() that will result in invalid memory reads.)

I see your point, thanks!
I guess you're saying that we have to do s.t like:
nic->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS
to enable NIC's multi-segment support for GSO.
And once enabling it, OVS might receive multi-segment
mbuf from the NIC, and OVS doesn't support it.

>
> GSO should be implemented in a same way TSO implemented.  Right now
> TSO implementation has no software fallback to segment large packets
> if netdev doesn't support - it it simply drops them.  However,
> software fallback should be implemented someday and it should be
> implemented in a generic way for all the netdev types instead of
> doing it only for one (netdev-dpdk).  Brief look at the patch tells
> me that the generic check from netdev_send_prepare_packet() was
> removed.  This means that huge packets could reach e.g. netdev-afxdp
> that will not behave correctly.
> Generic software fallback should produce sequence of 'dp_packet's
> instead of multi-segment mbufs, so there should be no need to enable
> support for multi-segment mbufs in DPDK PMD.
>
I agree, thanks!
So which one do we need, for dp_packet:
1) GSO 2) Tunneled GSO, ex: vxlan and geneve, or 3) Both?

I feel we can assume all HW nic today support TSO, but not
tunnel packet's TSO. So we only need to implement (2).
William
Ilya Maximets Feb. 9, 2021, 10:51 a.m. UTC | #7
On 2/8/21 8:54 PM, William Tu wrote:
> On Mon, Feb 8, 2021 at 6:36 AM Ilya Maximets <i.maximets@ovn.org> wrote:
>>
>> On 2/8/21 1:56 AM, Yi Yang (杨燚)-云服务集团 wrote:
>>> Yes, GSO  is ok, but GRO may be have that issue, I didn't see that issue in my openstack environment, so maybe it will be great if we can have a test case to trigger that issue.
>>>
>>> -----邮件原件-----
>>> 发件人: William Tu [mailto:u9012063@gmail.com]
>>> 发送时间: 2021年2月7日 23:46
>>> 收件人: Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com>
>>> 抄送: i.maximets@ovn.org; yang_y_yi@163.com; ovs-dev@openvswitch.org; fbl@sysclose.org
>>> 主题: Re: [ovs-dev] 答复: [PATCH V3 2/4] Add GSO support for DPDK data path
>>>
>>> On Tue, Oct 27, 2020 at 6:02 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote:
>>>>
>>>> -----邮件原件-----
>>>> 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets
>>>> 发送时间: 2020年10月27日 21:03
>>>> 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org
>>>> 抄送: fbl@sysclose.org; i.maximets@ovn.org
>>>> 主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path
>>>>
>>>> On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
>>>>> From: Yi Yang <yangyi01@inspur.com>
>>>>>
>>>>> GSO(Generic Segment Offload) can segment large UDP  and TCP packet
>>>>> to small packets per MTU of destination , especially for the case
>>>>> that physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO,
>>>>> GSO can make sure userspace TSO can still work but not drop.
>>>>>
>>>>> In addition, GSO can help improve UDP performane when UFO is enabled
>>>>> in VM.
>>>>>
>>>>> GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx
>>>>> function of physical NIC.
>>>>>
>>>>> Signed-off-by: Yi Yang <yangyi01@inspur.com>
>>>>> ---
>>>>>  lib/dp-packet.h    |  21 +++-
>>>>>  lib/netdev-dpdk.c  | 358
>>>>> +++++++++++++++++++++++++++++++++++++++++++++++++----
>>>>>  lib/netdev-linux.c |  17 ++-
>>>>>  lib/netdev.c       |  67 +++++++---
>>>>>  4 files changed, 417 insertions(+), 46 deletions(-)
>>>
>>> snip
>>>
>>>>>
>>>>> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>>>>>      return cnt;
>>>>>  }
>>>>>
>>>>> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes
>>>>> ownership of
>>>>> - * 'pkts', even in case of failure.
>>>>> - *
>>>>> - * Returns the number of packets that weren't transmitted. */
>>>>> static inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>>>>> -                         struct rte_mbuf **pkts, int cnt)
>>>>> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>>>>> +                           struct rte_mbuf **pkts, int cnt)
>>>>>  {
>>>>>      uint32_t nb_tx = 0;
>>>>> -    uint16_t nb_tx_prep = cnt;
>>>>> +    uint32_t nb_tx_prep;
>>>>>
>>>>> -    if (userspace_tso_enabled()) {
>>>>> -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
>>>>> -        if (nb_tx_prep != cnt) {
>>>>> -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
>>>>> -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
>>>>> -                         cnt, rte_strerror(rte_errno));
>>>>> -        }
>>>>> +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
>>>>> +    if (nb_tx_prep != cnt) {
>>>>> +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
>>>>> +                          "Only %u/%u are valid: %s",
>>>>> +                     dev->up.name, nb_tx_prep,
>>>>> +                     cnt, rte_strerror(rte_errno));
>>>>>      }
>>>>>
>>>>>      while (nb_tx != nb_tx_prep) {
>>>>> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>>>>>      return cnt - nb_tx;
>>>>>  }
>>>>>
>>>>> +static inline void
>>>>> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
>>>>
>>>> I didn't review the patch, only had a quick glance, but this part bothers me.  OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS.  So, I do not understand why this function needs to work with such mbufs.
>>>>
>>>> [Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf.
>>>>
>>>
>>> Hi Ilya,
>>>
>>> Now I understand Yi Yang's point better and I agree with him.
>>> Looks like the patch does the GSO at the DPDK TX function.
>>> It creates multi-seg mbuf after rte_gso_segment(), but will immediately send out the multi-seg mbuf to DPDK port, without traversing inside other part of OVS code. I guess this case it should work OK?
>>
>> Hi.  GSO itself should be possible to implement, but we should
>> not enable support for multi-segment mbufs in the NIC, since this
>> might end up in multi-segment packets being received by OVS
>> causing lots of trouble (for example dp_packet_clone() uses simple
>> memcpy() that will result in invalid memory reads.)
> 
> I see your point, thanks!
> I guess you're saying that we have to do s.t like:
> nic->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS
> to enable NIC's multi-segment support for GSO.
> And once enabling it, OVS might receive multi-segment
> mbuf from the NIC, and OVS doesn't support it.

OK.  This probably is not true.  I missed the fact that MULTI_SEGS
enabled only for tx offloads.  So, this should work, I guess.

But still, software fallback should not be implemented only for
netdev-dpdk.  It's needed for other port types too and it should
not depend on DPDK.

> 
>>
>> GSO should be implemented in a same way TSO implemented.  Right now
>> TSO implementation has no software fallback to segment large packets
>> if netdev doesn't support - it it simply drops them.  However,
>> software fallback should be implemented someday and it should be
>> implemented in a generic way for all the netdev types instead of
>> doing it only for one (netdev-dpdk).  Brief look at the patch tells
>> me that the generic check from netdev_send_prepare_packet() was
>> removed.  This means that huge packets could reach e.g. netdev-afxdp
>> that will not behave correctly.
>> Generic software fallback should produce sequence of 'dp_packet's
>> instead of multi-segment mbufs, so there should be no need to enable
>> support for multi-segment mbufs in DPDK PMD.
>>
> I agree, thanks!
> So which one do we need, for dp_packet:
> 1) GSO 2) Tunneled GSO, ex: vxlan and geneve, or 3) Both?
> 
> I feel we can assume all HW nic today support TSO, but not
> tunnel packet's TSO. So we only need to implement (2).

Well, we need everything.  We can't depend on fact that all ports
supports even TSO since this is under control of a VM configuration in
case of a VM attached via vhost-user port.  AF_XDP ports doesn't
support TSO, some DPDK vdevs probably doesn't support TSO.
If OVS is running inside a VM, ports might not support TSO,
since this depends on what is the underlying port implementation and
how the VM was configured.
diff mbox series

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 79895f2..c33868d 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -83,6 +83,8 @@  enum dp_packet_offload_mask {
     DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
     /* VXLAN TCP Segmentation Offload. */
     DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 0x1000),
+    /* UDP Segmentation Offload. */
+    DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000),
     /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
 };
 
@@ -97,7 +99,8 @@  enum dp_packet_offload_mask {
                                      DP_PACKET_OL_TX_IPV6          | \
                                      DP_PACKET_OL_TX_TCP_CKSUM     | \
                                      DP_PACKET_OL_TX_UDP_CKSUM     | \
-                                     DP_PACKET_OL_TX_SCTP_CKSUM)
+                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
+                                     DP_PACKET_OL_TX_UDP_SEG)
 
 #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
                                  DP_PACKET_OL_TX_UDP_CKSUM | \
@@ -956,6 +959,13 @@  dp_packet_hwol_is_tso(const struct dp_packet *b)
     return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG);
 }
 
+/* Returns 'true' if packet 'b' is marked for UDP segmentation offloading. */
+static inline bool
+dp_packet_hwol_is_uso(const struct dp_packet *b)
+{
+    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG);
+}
+
 /* Returns 'true' if packet 'b' is marked for IPv4 checksum offloading. */
 static inline bool
 dp_packet_hwol_is_ipv4(const struct dp_packet *b)
@@ -1034,6 +1044,15 @@  dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
     *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
 }
 
+/* Mark packet 'b' for UDP segmentation offloading.  It implies that
+ * either the packet 'b' is marked for IPv4 or IPv6 checksum offloading
+ * and also for UDP checksum offloading. */
+static inline void
+dp_packet_hwol_set_udp_seg(struct dp_packet *b)
+{
+    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG;
+}
+
 #ifdef DPDK_NETDEV
 /* Mark packet 'b' for VXLAN TCP segmentation offloading. */
 static inline void
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 30493ed..888a45e 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -38,13 +38,15 @@ 
 #include <rte_errno.h>
 #include <rte_ethdev.h>
 #include <rte_flow.h>
+#include <rte_gso.h>
+#include <rte_ip.h>
 #include <rte_malloc.h>
 #include <rte_mbuf.h>
 #include <rte_meter.h>
 #include <rte_pci.h>
 #include <rte_version.h>
 #include <rte_vhost.h>
-#include <rte_ip.h>
+#include <rte_ip_frag.h>
 
 #include "cmap.h"
 #include "coverage.h"
@@ -162,6 +164,7 @@  typedef uint16_t dpdk_port_t;
                                    | DEV_TX_OFFLOAD_UDP_CKSUM    \
                                    | DEV_TX_OFFLOAD_IPV4_CKSUM)
 
+#define MAX_GSO_MBUFS 64
 
 static const struct rte_eth_conf port_conf = {
     .rxmode = {
@@ -2171,6 +2174,16 @@  is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
     return ret;
 }
 
+static uint16_t
+get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype)
+{
+        if (ethertype == htons(RTE_ETHER_TYPE_IPV4)) {
+                return rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
+        } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
+                return rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
+        }
+}
+
 /* Prepare the packet for HWOL.
  * Return True if the packet is OK to continue. */
 static bool
@@ -2203,10 +2216,9 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
          * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared
          * outer_l2_len and outer_l3_len must be zeroed.
          */
-        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)
-            || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
+        if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
             && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
-                + mbuf->l2_len)))  {
+                + mbuf->l2_len))  {
             mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
             mbuf->l2_len -= sizeof(struct udp_header)
                         + sizeof(struct vxlanhdr);
@@ -2249,7 +2261,7 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
             /* inner IP checksum offload offload */
             mbuf->ol_flags |= PKT_TX_IP_CKSUM;
         }
-    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
+    } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
         /* Handle VLAN TSO */
             /* no inner IP checksum for IPV6 */
         if (mbuf->ol_flags & PKT_TX_IPV4) {
@@ -2273,6 +2285,18 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
         mbuf->outer_l2_len = 0;
         mbuf->outer_l3_len = 0;
+
+        /* In case of GRO, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set by GRO
+         * APIs, here is a place we can mark it.
+         */
+        if ((mbuf->pkt_len > 1464)
+            && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
+            if (l4_proto == IPPROTO_UDP) {
+                mbuf->ol_flags |= PKT_TX_UDP_SEG;
+            } else if (l4_proto == IPPROTO_TCP) {
+                mbuf->ol_flags |= PKT_TX_TCP_SEG;
+            }
+        }
     }
 
     /* It is possible that l4_len isn't set for vhostuserclient */
@@ -2284,6 +2308,10 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
     }
 
+    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
+        return true;
+    }
+
     if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
         if (l4_proto != IPPROTO_UDP) {
             VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
@@ -2294,11 +2322,13 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
                mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
         if (l4_proto != IPPROTO_TCP) {
             VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
-                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
+                         " pkt len: %"PRIu32" l4_proto = %d",
+                         dev->up.name, mbuf->pkt_len, l4_proto);
             return false;
         }
 
-        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
+        if (mbuf->pkt_len > 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
+            + mbuf->l2_len) {
             dp_packet_hwol_set_tcp_seg(pkt);
         }
 
@@ -2308,7 +2338,66 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         } else {
             mbuf->tso_segsz = 0;
         }
+
+        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
+            /* PKT_TX_TCP_CKSUM must be cleaned for GSO because
+             * tcp checksum only can be caculated by software for
+             * GSO case.
+             */
+            mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM;
+        }
     }
+
+    /* UDP GSO if necessary */
+    if (l4_proto == IPPROTO_UDP) {
+        /* VXLAN GSO can be done here */
+        if ((mbuf->ol_flags & PKT_TX_UDP_SEG) ||
+            (mbuf->pkt_len > (1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
+                             mbuf->l2_len))) {
+            dp_packet_hwol_set_udp_seg(pkt);
+
+            /* For UDP GSO, udp checksum must be calculated by software */
+            if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
+                void *l3_hdr, *l4_hdr;
+                struct rte_udp_hdr *udp_hdr;
+
+                /* PKT_TX_UDP_CKSUM must be cleaned for GSO because
+                 * udp checksum only can be caculated by software for
+                 * GSO case.
+                 */
+                mbuf->ol_flags &= ~PKT_TX_UDP_CKSUM;
+
+                eth_hdr = (struct rte_ether_hdr *)
+                    ((uint8_t *)eth_hdr + mbuf->outer_l2_len +
+                               mbuf->outer_l3_len +
+                               sizeof(struct udp_header) +
+                               sizeof(struct vxlanhdr));
+                l3_hdr = (uint8_t *)eth_hdr + mbuf->l2_len -
+                         sizeof(struct udp_header) -
+                         sizeof(struct vxlanhdr);
+                l4_hdr = (uint8_t *)l3_hdr + mbuf->l3_len;
+                ip_hdr = (struct rte_ipv4_hdr *)l3_hdr;
+                ip_hdr->hdr_checksum = 0;
+                ip_hdr->hdr_checksum = rte_ipv4_cksum(ip_hdr);
+                /* Don't touch UDP checksum if it is ip fragment */
+                if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) {
+                    udp_hdr = (struct rte_udp_hdr *)l4_hdr;
+                    udp_hdr->dgram_cksum = 0;
+                    udp_hdr->dgram_cksum =
+                        get_udptcp_checksum(l3_hdr, l4_hdr,
+                                            eth_hdr->ether_type);
+                }
+            }
+
+            /* FOR GSO, gso_size includes l2_len + l3_len */
+            mbuf->tso_segsz = 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
+                              mbuf->l2_len;
+            if (mbuf->tso_segsz > dev->mtu) {
+                mbuf->tso_segsz = dev->mtu;
+            }
+        }
+    }
+
     return true;
 }
 
@@ -2339,24 +2428,19 @@  netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
     return cnt;
 }
 
-/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
- * 'pkts', even in case of failure.
- *
- * Returns the number of packets that weren't transmitted. */
 static inline int
-netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
-                         struct rte_mbuf **pkts, int cnt)
+__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
+                           struct rte_mbuf **pkts, int cnt)
 {
     uint32_t nb_tx = 0;
-    uint16_t nb_tx_prep = cnt;
+    uint32_t nb_tx_prep;
 
-    if (userspace_tso_enabled()) {
-        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
-        if (nb_tx_prep != cnt) {
-            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
-                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
-                         cnt, rte_strerror(rte_errno));
-        }
+    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
+    if (nb_tx_prep != cnt) {
+        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
+                          "Only %u/%u are valid: %s",
+                     dev->up.name, nb_tx_prep,
+                     cnt, rte_strerror(rte_errno));
     }
 
     while (nb_tx != nb_tx_prep) {
@@ -2384,6 +2468,200 @@  netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
     return cnt - nb_tx;
 }
 
+static inline void
+set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
+{
+    uint16_t l3_offset = mbuf->outer_l2_len + mbuf->outer_l3_len
+                         + mbuf->l2_len;
+    struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
+        (rte_pktmbuf_mtod(mbuf, char *) + l3_offset);
+    struct rte_tcp_hdr *tcp_hdr;
+    uint32_t l4_hdr_len;
+    uint8_t *l4_hdr;
+    struct rte_mbuf *next = mbuf->next;
+    uint32_t cksum = 0;
+    uint16_t l4_proto;
+    uint32_t inner_cksum;
+
+    l4_proto = ipv4_hdr->next_proto_id;
+    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
+        return;
+    }
+
+    if (l4_proto == IPPROTO_TCP) {
+        /* For TCP GSO, inner TCP header is in every seg,
+         * TCP checksum has to be calculated by software.
+         */
+
+        l4_hdr_len = mbuf->data_len - l3_offset
+                     - sizeof(struct rte_ipv4_hdr);
+        l4_hdr = (uint8_t *)(ipv4_hdr + 1);
+        tcp_hdr = (struct rte_tcp_hdr *)l4_hdr;
+        tcp_hdr->cksum = 0;
+    }
+
+    /* Set inner ip checksum */
+    ipv4_hdr->hdr_checksum = 0;
+    ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
+
+    if (l4_proto == IPPROTO_TCP) {
+        cksum = rte_raw_cksum(l4_hdr, l4_hdr_len);
+    } else if (l4_proto == IPPROTO_UDP) {
+        if (next == NULL) {
+            /* It wasn't GSOed */
+            cksum = rte_raw_cksum(ipv4_hdr + 1,
+                                  ntohs(ipv4_hdr->total_length)
+                                      - sizeof(struct rte_ipv4_hdr));
+        } else {
+            cksum = 0;
+        }
+    }
+
+    /* It was GSOed */
+    while (next) {
+        cksum += rte_raw_cksum(rte_pktmbuf_mtod(next, char *), next->data_len);
+        next = next->next;
+    }
+
+    /* Save cksum to inner_cksum, outer udp checksum needs it */
+    inner_cksum = cksum;
+
+    cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0);
+    cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
+    cksum = (~cksum) & 0xffff;
+    if (cksum == 0) {
+        cksum = 0xffff;
+    }
+
+    /* Set inner TCP checksum */
+    if (l4_proto == IPPROTO_TCP) {
+        tcp_hdr->cksum = (uint16_t)cksum;
+    }
+
+    /* Set outer udp checksum in case of VXLAN */
+    if (mbuf->outer_l2_len != 0) {
+        ipv4_hdr = (struct rte_ipv4_hdr *)
+            (rte_pktmbuf_mtod(mbuf, char *) + mbuf->outer_l2_len);
+        struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *)
+            (ipv4_hdr + 1);
+
+        /* Set outer ip checksum */
+        ipv4_hdr->hdr_checksum = 0;
+        ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
+
+        udp_hdr->dgram_cksum = 0;
+        cksum = rte_ipv4_phdr_cksum(ipv4_hdr, 0);
+        cksum += rte_raw_cksum(udp_hdr, mbuf->l2_len + mbuf->l3_len);
+        cksum += inner_cksum;
+        if (l4_proto == IPPROTO_TCP) {
+            cksum += tcp_hdr->cksum;
+        }
+        cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
+        cksum = (~cksum) & 0xffff;
+        if (cksum == 0) {
+            cksum = 0xffff;
+        }
+        udp_hdr->dgram_cksum = (uint16_t)cksum;
+    }
+}
+
+/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'.  Takes ownership of
+ * 'pkts', even in case of failure.
+ *
+ * Returns the number of packets that weren't transmitted. */
+static inline int
+netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
+                         struct rte_mbuf **pkts, int cnt)
+{
+    uint32_t nb_tx = 0;
+    int i;
+    int ret;
+    int failures = 0;
+
+    if (userspace_tso_enabled()) {
+        /* The best point to do gso */
+        struct rte_gso_ctx gso_ctx;
+        struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+        int tx_start = -1;
+
+        /* Setup gso context */
+        gso_ctx.direct_pool = dev->dpdk_mp->mp;
+        gso_ctx.indirect_pool = dev->dpdk_mp->mp;
+
+        /* Do GSO if needed */
+        for (i = 0; i < cnt; i++) {
+            if (((pkts[i]->ol_flags & PKT_TX_UDP_SEG) &&
+                  !(dev->hw_ol_features & DEV_TX_OFFLOAD_UDP_TSO)) ||
+                ((pkts[i]->ol_flags & PKT_TX_TCP_SEG) &&
+                  ((!(dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD)
+                   && (pkts[i]->ol_flags & PKT_TX_TUNNEL_VXLAN))
+                   || !(dev->hw_ol_features & DEV_TX_OFFLOAD_TCP_TSO)))) {
+                /* Send non GSO packets before pkts[i] */
+                if (tx_start != -1) {
+                    failures += __netdev_dpdk_eth_tx_burst(
+                                    dev, qid,
+                                    pkts + tx_start,
+                                    i - tx_start);
+                }
+                tx_start = -1;
+
+                gso_ctx.gso_types = 0;
+                gso_ctx.gso_size = pkts[i]->tso_segsz;
+                gso_ctx.flag = 0;
+                if (pkts[i]->ol_flags & PKT_TX_TUNNEL_VXLAN) {
+                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
+                }
+                if (pkts[i]->ol_flags & PKT_TX_UDP_SEG) {
+                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_UDP_TSO;
+                } else if (pkts[i]->ol_flags & PKT_TX_TCP_SEG) {
+                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_TCP_TSO;
+                    pkts[i]->ol_flags &= ~PKT_TX_TCP_CKSUM;
+                }
+                ret = rte_gso_segment(pkts[i], /* packet to segment */
+                                      &gso_ctx, /* gso context */
+                                      /* gso output mbufs */
+                                      (struct rte_mbuf **)&gso_mbufs,
+                                      MAX_GSO_MBUFS);
+                if (ret < 0) {
+                    rte_pktmbuf_free(pkts[i]);
+                } else {
+                    int j, k;
+                    struct rte_mbuf * next_part;
+                    nb_tx = ret;
+                    for (j = 0; j < nb_tx; j++) {
+                        set_multiseg_udptcp_cksum(gso_mbufs[j]);
+                        /* Clear them because of no offload */
+                        gso_mbufs[j]->ol_flags = 0;
+                        gso_mbufs[j]->outer_l2_len = 0;
+                        gso_mbufs[j]->outer_l3_len = 0;
+                        gso_mbufs[j]->l2_len = 0;
+                        gso_mbufs[j]->l3_len = 0;
+                        gso_mbufs[j]->l4_len = 0;
+                        next_part = gso_mbufs[j];
+                        for (k = 0; k < gso_mbufs[j]->nb_segs; k++) {
+                            next_part = next_part->next;
+                        }
+                    }
+                    __netdev_dpdk_eth_tx_burst(dev, qid, gso_mbufs, nb_tx);
+                }
+                continue;
+            }
+            if (tx_start == -1) {
+                tx_start = i;
+            }
+        }
+
+        if (tx_start != -1) {
+            /* Send non GSO packets before pkts[i] */
+            failures += __netdev_dpdk_eth_tx_burst(dev, qid, pkts + tx_start,
+                                       i - tx_start);
+        }
+        return failures;
+    }
+
+    return __netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt);
+}
+
 static inline bool
 netdev_dpdk_srtcm_policer_pkt_handle(struct rte_meter_srtcm *meter,
                                      struct rte_meter_srtcm_profile *profile,
@@ -2786,10 +3064,24 @@  out:
     }
 }
 
+struct shinfo_arg {
+    void * buf;
+    struct rte_mbuf *mbuf;
+};
+
+/* For GSO case, the extended mbuf only can be freed by
+ * netdev_dpdk_extbuf_free
+ */
 static void
-netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
+netdev_dpdk_extbuf_free(struct rte_mbuf *m, void *opaque)
 {
-    rte_free(opaque);
+    struct shinfo_arg *arg = (struct shinfo_arg *)opaque;
+
+    rte_free(arg->buf);
+    if (m != arg->mbuf) {
+        rte_pktmbuf_free(arg->mbuf);
+    }
+    free(arg);
 }
 
 static struct rte_mbuf *
@@ -2821,8 +3113,11 @@  dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
 
     /* Initialize shinfo. */
     if (shinfo) {
+        struct shinfo_arg *arg = xmalloc(sizeof(struct shinfo_arg));
+        arg->buf = buf;
+        arg->mbuf = pkt;
         shinfo->free_cb = netdev_dpdk_extbuf_free;
-        shinfo->fcb_opaque = buf;
+        shinfo->fcb_opaque = arg;
         rte_mbuf_ext_refcnt_set(shinfo, 1);
     } else {
         shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
@@ -2852,6 +3147,10 @@  dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
         return NULL;
     }
 
+    if (unlikely(pkt->shinfo != NULL)) {
+        pkt->shinfo = NULL;
+    }
+
     if (rte_pktmbuf_tailroom(pkt) >= data_len) {
         return pkt;
     }
@@ -5192,6 +5491,7 @@  netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
     int err;
     uint64_t vhost_flags = 0;
     uint64_t vhost_unsup_flags;
+    uint64_t vhost_supported_flags;
     bool zc_enabled;
 
     ovs_mutex_lock(&dev->mutex);
@@ -5277,6 +5577,16 @@  netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
             goto unlock;
         }
 
+        err = rte_vhost_driver_get_features(dev->vhost_id,
+                                            &vhost_supported_flags);
+        if (err) {
+            VLOG_ERR("rte_vhost_driver_get_features failed for "
+                     "vhost user client port: %s\n", dev->up.name);
+            goto unlock;
+        }
+        VLOG_INFO("vhostuserclient port %s features: 0x%016lx",
+                  dev->up.name, vhost_supported_flags);
+
         err = rte_vhost_driver_start(dev->vhost_id);
         if (err) {
             VLOG_ERR("rte_vhost_driver_start failed for vhost user "
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 9f830b4..557f139 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -6566,6 +6566,16 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
 
         l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
         dp_packet_hwol_set_l4_len(b, l4_len);
+    } else if (*l4proto == IPPROTO_UDP) {
+        struct udp_header *udp_hdr =  dp_packet_at(b, l2_len + l3_len,
+                                          sizeof(struct udp_header));
+
+        if (!udp_hdr) {
+            return -EINVAL;
+        }
+
+        l4_len = sizeof(struct udp_header);
+        dp_packet_hwol_set_l4_len(b, l4_len);
     }
 
     return 0;
@@ -6581,10 +6591,6 @@  netdev_linux_parse_vnet_hdr(struct dp_packet *b)
         return -EINVAL;
     }
 
-    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
-        return 0;
-    }
-
     if (netdev_linux_parse_l2(b, &l4proto)) {
         return -EINVAL;
     }
@@ -6609,6 +6615,9 @@  netdev_linux_parse_vnet_hdr(struct dp_packet *b)
             || type == VIRTIO_NET_HDR_GSO_TCPV6) {
             dp_packet_hwol_set_tcp_seg(b);
         }
+        if (type == VIRTIO_NET_HDR_GSO_UDP) {
+            dp_packet_hwol_set_udp_seg(b);
+        }
     }
 
     return 0;
diff --git a/lib/netdev.c b/lib/netdev.c
index 64583d1..02f28c8 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -825,23 +825,41 @@  netdev_send_prepare_packet(const uint64_t netdev_flags,
 {
     uint64_t l4_mask;
 
-    if (dp_packet_hwol_is_vxlan_tcp_seg(packet)
-        && (dp_packet_hwol_is_tso(packet) || dp_packet_hwol_l4_mask(packet))
-        && !(netdev_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)) {
-        /* Fall back to GSO in software. */
-        VLOG_ERR_BUF(errormsg, "No VXLAN TSO support");
-        return false;
-    }
-
-    if (dp_packet_hwol_is_tso(packet)
-        && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
-            /* Fall back to GSO in software. */
-            VLOG_ERR_BUF(errormsg, "No TSO support");
-            return false;
-    }
-
+    /* GSO can handle TSO by software even if device can't handle hardware
+     * offload, so needn't check it here.
+     */
     l4_mask = dp_packet_hwol_l4_mask(packet);
     if (l4_mask) {
+        /* Calculate checksum for VLAN TSO case when no hardware offload
+         * feature is available. Note: for VXLAN TSO case, checksum has
+         * been calculated before here, so it won't be done here again
+         * because checksum flags in packet->m.ol_flags have been cleaned.
+         */
+        if (dp_packet_hwol_l4_is_tcp(packet)
+            && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
+            && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
+            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_TCP_CKSUM;
+            /* Only calculate TCP checksum for non-TSO packet,
+             * it will be calculated after GSO for TSO packet.
+             */
+            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_TCP_SEG)) {
+                calculate_tcpudp_checksum(packet);
+            }
+            return true;
+        } else if (dp_packet_hwol_l4_is_udp(packet)
+            && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
+            && !(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
+            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
+            /* Only calculate UDP checksum for non-UFO packet,
+             * it will be calculated immediately before GSO for
+             * UFO packet.
+             */
+            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_UDP_SEG)) {
+                calculate_tcpudp_checksum(packet);
+            }
+            return true;
+        }
+
         if (dp_packet_hwol_l4_is_tcp(packet)) {
             if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
                 /* Fall back to TCP csum in software. */
@@ -1013,11 +1031,26 @@  netdev_push_header(const struct netdev *netdev,
                 /* VXLAN offload can't support udp checksum offload
                  * for inner udp packet, so udp checksum must be set
                  * before push header in order that outer checksum can
-                 * be set correctly.
+                 * be set correctly. But GSO code will set udp checksum
+                 * if packet->mbuf.ol_flags has DP_PACKET_OL_TX_UDP_SEG.
                  */
                 if (dp_packet_hwol_l4_is_udp(packet)) {
                     packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
-                    calculate_tcpudp_checksum(packet);
+                    /* Only calculate UDP checksum for non-UFO packet,
+                     * it will be calculated immediately before GSO for
+                     * UFO packet.
+                     */
+                    if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_UDP_SEG)) {
+                        calculate_tcpudp_checksum(packet);
+                    }
+                } else if (dp_packet_hwol_l4_is_tcp(packet)) {
+                    packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_TCP_CKSUM;
+                    /* Only calculate TCP checksum for non-TSO packet,
+                     * it will be calculated after GSO for TSO packet.
+                     */
+                    if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_TCP_SEG)) {
+                        calculate_tcpudp_checksum(packet);
+                    }
                 }
             }
             netdev->netdev_class->push_header(netdev, packet, data);