[ovs-dev,v2,2/5] Enable VXLAN TSO for DPDK datapath

Message ID 20200701091533.221552-3-yang_y_yi@163.com
State Superseded
Series userspace: enable VXLAN TSO, GSO and GRO

Commit Message

yang_y_yi July 1, 2020, 9:15 a.m. UTC
From: Yi Yang <yangyi01@inspur.com>

Many NICs support VXLAN TSO, which can help
improve cross-compute-node VM-to-VM performance
when the MTU is set to 1500.

This patch allows the dpdkvhostuserclient
interface and veth/tap interfaces to leverage
NICs' offload capability to maximize
cross-compute-node TCP performance. With it
applied, OVS DPDK can reach line speed for
cross-compute-node VM-to-VM TCP traffic.

Signed-off-by: Yi Yang <yangyi01@inspur.com>
---
 lib/dp-packet.h    |  61 +++++++++++++++++
 lib/netdev-dpdk.c  | 193 +++++++++++++++++++++++++++++++++++++++++++++++++----
 lib/netdev-linux.c |  20 ++++++
 lib/netdev.c       |  14 ++--
 4 files changed, 271 insertions(+), 17 deletions(-)

Comments

Flavio Leitner July 10, 2020, 8:50 p.m. UTC | #1
Hi Yi,

This is not a full review, but netdev-dpdk.c is used by Windows
and BSD as well, and there is a 'linux' function which seems
to be a copy of another existing one. Perhaps we can use just one?

This patch resets ol_flags from vhostuser, ignoring what has
been set by rte_vhost_dequeue_burst(). What happens if a VM
turns off offloading? Also, this is always enabled, while
userspace offloading is experimental and defaults to off.

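For illustration, a minimal sketch of what keeping the guest's hints
could look like (the helper name and the exact mask are hypothetical,
not something in the tree):

    #include <rte_mbuf.h>

    /* Keep only the tx offload requests the guest actually made via
     * rte_vhost_dequeue_burst(), instead of zeroing ol_flags and
     * re-deriving everything by re-parsing the headers. */
    static void
    sanitize_vhost_tx_flags(struct rte_mbuf *mbuf)
    {
        const uint64_t keep = PKT_TX_IPV4 | PKT_TX_IPV6
                              | PKT_TX_L4_MASK | PKT_TX_TCP_SEG;

        mbuf->ol_flags &= keep;
    }

With something like that, a VM that turns offloading off keeps
working, because its packets arrive with none of these bits set.
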
Why do we need to set l2_len, l3_len and l4_len when receiving
from the VM? Those are not used by OVS, and if the packet
changes during pipeline execution, they will need to be
updated in the appropriate prepare function, which for dpdk is
netdev_dpdk_prep_hwol_packet().

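For example (sketch only, built from the existing dp_packet accessors,
the same way the pre-patch code derived l4_len):

    #include "dp-packet.h"

    /* Derive all length fields at prepare time from the packet's own
     * offsets; nothing needs to be carried over from the rx path. */
    static void
    prep_hwol_lengths(struct dp_packet *pkt, struct rte_mbuf *mbuf)
    {
        struct tcp_header *th = dp_packet_l4(pkt);

        mbuf->l2_len = (char *) dp_packet_l3(pkt)
                       - (char *) dp_packet_eth(pkt);
        mbuf->l3_len = (char *) dp_packet_l4(pkt)
                       - (char *) dp_packet_l3(pkt);
        if (th) {
            mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
        }
    }
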
A few more comments below.

Thanks!
fbl

On Wed, Jul 01, 2020 at 05:15:30PM +0800, yang_y_yi@163.com wrote:
> From: Yi Yang <yangyi01@inspur.com>
> 
> Many NICs support VXLAN TSO, which can help
> improve cross-compute-node VM-to-VM performance
> when the MTU is set to 1500.
> 
> This patch allows the dpdkvhostuserclient
> interface and veth/tap interfaces to leverage
> NICs' offload capability to maximize
> cross-compute-node TCP performance. With it
> applied, OVS DPDK can reach line speed for
> cross-compute-node VM-to-VM TCP traffic.
> 
> Signed-off-by: Yi Yang <yangyi01@inspur.com>
> ---
>  lib/dp-packet.h    |  61 +++++++++++++++++
>  lib/netdev-dpdk.c  | 193 +++++++++++++++++++++++++++++++++++++++++++++++++----
>  lib/netdev-linux.c |  20 ++++++
>  lib/netdev.c       |  14 ++--
>  4 files changed, 271 insertions(+), 17 deletions(-)
> 
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 070d111..07af124 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -1034,6 +1034,67 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>  }
>  
> +#ifdef DPDK_NETDEV
> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
> +static inline void
> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
> +{
> +    b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
> +    b->mbuf.l2_len += sizeof(struct udp_header) +
> +                      sizeof(struct vxlanhdr);
> +    b->mbuf.outer_l2_len = ETH_HEADER_LEN;
> +    b->mbuf.outer_l3_len = IP_HEADER_LEN;

What about IPv6?

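If the helper took the outer address family into account, it might
look something like this in dp-packet.h (sketch; the 'outer_ipv6'
parameter is hypothetical):

    /* Mark packet 'b' for VXLAN TCP segmentation offloading. */
    static inline void
    dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b, bool outer_ipv6)
    {
        b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
        b->mbuf.l2_len += sizeof(struct udp_header)
                          + sizeof(struct vxlanhdr);
        b->mbuf.outer_l2_len = ETH_HEADER_LEN;
        if (outer_ipv6) {
            b->mbuf.ol_flags |= PKT_TX_OUTER_IPV6;
            b->mbuf.outer_l3_len = IPV6_HEADER_LEN;
        } else {
            b->mbuf.ol_flags |= PKT_TX_OUTER_IPV4;
            b->mbuf.outer_l3_len = IP_HEADER_LEN;
        }
    }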

> +}
> +
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
> +{
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
> +{
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
> +{
> +    b->mbuf.l4_len = l4_len;
> +}
> +#else
> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
> +static inline void
> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b OVS_UNUSED)
> +{
> +}
> +
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b OVS_UNUSED,
> +                          int l2_len OVS_UNUSED)
> +{
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b OVS_UNUSED,
> +                          int l3_len OVS_UNUSED)
> +{
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED,
> +                          int l4_len OVS_UNUSED)
> +{
> +}
> +#endif /* DPDK_NETDEV */
> +
>  static inline bool
>  dp_packet_ip_checksum_valid(const struct dp_packet *p)
>  {
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 44ebf96..bf5fa63 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -44,6 +44,7 @@
>  #include <rte_pci.h>
>  #include <rte_version.h>
>  #include <rte_vhost.h>
> +#include <rte_ip.h>

We have network header definitions in OVS and we should
give preference to them.

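For example, the inner protocol lookup could use the OVS structs from
lib/packets.h instead (sketch):

    #include "dp-packet.h"
    #include "packets.h"

    /* Same read as rte_ipv4_hdr's next_proto_id, but with the OVS
     * definition of the IPv4 header. */
    static uint8_t
    inner_l4_proto(const struct dp_packet *pkt)
    {
        const struct ip_header *ip = dp_packet_l3(pkt);

        return ip ? ip->ip_proto : 0;
    }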

>  #include "cmap.h"
>  #include "coverage.h"
> @@ -87,6 +88,7 @@ COVERAGE_DEFINE(vhost_notification);
>  
>  #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
>  #define OVS_VPORT_DPDK "ovs_dpdk"
> +#define DPDK_RTE_HDR_OFFSET 1

Perhaps name it HDR_NEXT_OFFSET and define it somewhere more
generic, because it's neither RTE nor DPDK specific?

>  
>  /*
>   * need to reserve tons of extra space in the mbufs so we can align the
> @@ -405,6 +407,7 @@ enum dpdk_hw_ol_features {
>      NETDEV_RX_HW_SCATTER = 1 << 2,
>      NETDEV_TX_TSO_OFFLOAD = 1 << 3,
>      NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
> +    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
>  };
>  
>  /*
> @@ -988,6 +991,12 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>  
>      if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
>          conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
> +        /* Enable VXLAN TSO support if available */
> +        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +            conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
> +            conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +            conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
> +        }
>          if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
>              conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
>          }
> @@ -1126,6 +1135,10 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          if ((info.tx_offload_capa & tx_tso_offload_capa)
>              == tx_tso_offload_capa) {
>              dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
> +            /* Enable VXLAN TSO support if available */
> +            if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +                dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +            }
>              if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
>                  dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
>              } else {
> @@ -2137,29 +2150,96 @@ static bool
>  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>  {
>      struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
> +    uint16_t l4_proto = 0;
> +    struct rte_ether_hdr *eth_hdr =
> +        rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
> +    struct rte_ipv4_hdr *ip_hdr;
> +    struct rte_ipv6_hdr *ip6_hdr;
> +
> +    if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
> +        /* Handle VXLAN TSO */
> +        struct rte_udp_hdr *udp_hdr;
> +
> +        if (mbuf->ol_flags & PKT_TX_IPV4) {
> +            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
> +            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + DPDK_RTE_HDR_OFFSET);
> +
> +            /* outer IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
> +
> +            ip_hdr = (struct rte_ipv4_hdr *)
> +                ((uint8_t *)udp_hdr + mbuf->l2_len);
> +            l4_proto = ip_hdr->next_proto_id;
> +
> +            /* inner IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
> +            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
> +            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + DPDK_RTE_HDR_OFFSET);
> +
> +            /* outer IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
> +
> +            ip6_hdr = (struct rte_ipv6_hdr *)
> +                ((uint8_t *)udp_hdr + mbuf->l2_len);
> +            l4_proto = ip6_hdr->proto;
> +
> +            /* inner IP checksum offload */
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        }
> +    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> +        /* Handle VLAN TSO */
> +        /* No inner IP checksum for IPv6. */
> +        if (mbuf->ol_flags & PKT_TX_IPV4) {
> +            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
> +            l4_proto = ip_hdr->next_proto_id;
> +
> +            /* IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
> +            ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
> +            l4_proto = ip6_hdr->proto;
> +
> +            /* IP checksum offload */
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        }
>  
> -    if (mbuf->ol_flags & PKT_TX_L4_MASK) {
>          mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
>          mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
>          mbuf->outer_l2_len = 0;
>          mbuf->outer_l3_len = 0;
>      }
>  
> -    if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
> -        struct tcp_header *th = dp_packet_l4(pkt);
> -
> -        if (!th) {
> +    if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
> +        if (l4_proto != IPPROTO_UDP) {
> +            VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
> +                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +            return false;
> +        }
> +        /* VXLAN GSO can be done here */
> +    } else if (mbuf->ol_flags & PKT_TX_TCP_SEG ||
> +               mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
> +        if (l4_proto != IPPROTO_TCP) {
>              VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
>                           " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
>              return false;
>          }
>  
> -        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> -        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
> -        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
> +            dp_packet_hwol_set_tcp_seg(pkt);
> +        }
>  
> -        if (mbuf->ol_flags & PKT_TX_IPV4) {
> -            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
> +        if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
> +            mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
> +        } else {
> +            mbuf->tso_segsz = 0;
>          }
>      }
>      return true;
> @@ -2365,6 +2445,71 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
>      }
>  }
>  
> +static void
> +netdev_linux_parse_l2(struct dp_packet *pkt, uint16_t *l4_proto)
> +{
> +    struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt;
> +    struct rte_ether_hdr *eth_hdr =
> +                rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
> +    ovs_be16 eth_type;
> +    int l2_len;
> +    int l3_len = 0;
> +    int l4_len = 0;
> +
> +    l2_len = ETH_HEADER_LEN;
> +    eth_type = (OVS_FORCE ovs_be16) eth_hdr->ether_type;
> +    if (eth_type_vlan(eth_type)) {
> +        struct rte_vlan_hdr *vlan_hdr =
> +                        (struct rte_vlan_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
> +
> +        eth_type = (OVS_FORCE ovs_be16) vlan_hdr->eth_proto;
> +        l2_len += VLAN_HEADER_LEN;
> +    }
> +
> +    dp_packet_hwol_set_l2_len(pkt, l2_len);
> +
> +    if (eth_type == htons(ETH_TYPE_IP)) {
> +        struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
> +            ((char *)eth_hdr + l2_len);
> +
> +        l3_len = IP_HEADER_LEN;
> +        dp_packet_hwol_set_tx_ipv4(pkt);
> +        *l4_proto = ipv4_hdr->next_proto_id;
> +    } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) {
> +        struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *)
> +            ((char *)eth_hdr + l2_len);
> +        l3_len = IPV6_HEADER_LEN;
> +        dp_packet_hwol_set_tx_ipv6(pkt);
> +        *l4_proto = ipv6_hdr->proto;
> +    }
> +
> +    dp_packet_hwol_set_l3_len(pkt, l3_len);
> +
> +    if (*l4_proto == IPPROTO_TCP) {
> +        struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *)
> +            ((char *)eth_hdr + l2_len + l3_len);
> +
> +        l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
> +        dp_packet_hwol_set_l4_len(pkt, l4_len);
> +    }
> +}
> +
> +static void
> +netdev_dpdk_parse_hdr(struct dp_packet *b)
> +{
> +    uint16_t l4_proto = 0;
> +
> +    netdev_linux_parse_l2(b, &l4_proto);
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        dp_packet_hwol_set_csum_tcp(b);
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        dp_packet_hwol_set_csum_udp(b);
> +    } else if (l4_proto == IPPROTO_SCTP) {
> +        dp_packet_hwol_set_csum_sctp(b);
> +    }
> +}
> +
>  /*
>   * The receive path for the vhost port is the TX path out from guest.
>   */
> @@ -2378,6 +2523,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>      uint16_t qos_drops = 0;
>      int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
>      int vid = netdev_dpdk_get_vid(dev);
> +    struct dp_packet *packet;
>  
>      if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
>                       || !(dev->flags & NETDEV_UP))) {
> @@ -2417,6 +2563,14 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>      batch->count = nb_rx;
>      dp_packet_batch_init_packet_fields(batch);
>  
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct rte_mbuf *mbuf = (struct rte_mbuf *)packet;
> +
> +        /* Clear ol_flags and set it by parsing header */
> +        mbuf->ol_flags = 0;
> +        netdev_dpdk_parse_hdr(packet);
> +    }
> +
>      return 0;
>  }
>  
> @@ -2737,13 +2891,18 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
>  
>      mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
>      mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
> -    mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
> -                            ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
> +    mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
> +    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
> +    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
> +    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
> +    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
> +    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
>  
>      memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
>             sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
>  
> -    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
> +    if ((mbuf_dest->outer_l2_len == 0) &&
> +        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
>          mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
>                                  - (char *)dp_packet_eth(pkt_dest);
>          mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
> @@ -2773,6 +2932,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>      uint32_t tx_failure = 0;
>      uint32_t mtu_drops = 0;
>      uint32_t qos_drops = 0;
> +    struct rte_mbuf *mbuf;
>  
>      if (dev->type != DPDK_DEV_VHOST) {
>          /* Check if QoS has been configured for this netdev. */
> @@ -2801,6 +2961,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>              break;
>          }
>  
> +        mbuf = (struct rte_mbuf *)pkts[txcnt];
> +        netdev_dpdk_prep_hwol_packet(dev, mbuf);
> +
>          txcnt++;
>      }
>  
> @@ -4949,6 +5112,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>          netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
>          netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
>          netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> +        /* Enable VXLAN TSO support if available */
> +        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +            netdev->ol_flags |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +        }
>          if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
>              netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
>          }
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 6269c24..f6e80fc 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -6500,6 +6500,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>      struct eth_header *eth_hdr;
>      ovs_be16 eth_type;
>      int l2_len;
> +    int l3_len = 0;
> +    int l4_len = 0;
>  
>      eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
>      if (!eth_hdr) {
> @@ -6519,6 +6521,8 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>          l2_len += VLAN_HEADER_LEN;
>      }
>  
> +    dp_packet_hwol_set_l2_len(b, l2_len);
> +
>      if (eth_type == htons(ETH_TYPE_IP)) {
>          struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
>  
> @@ -6526,6 +6530,7 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>              return -EINVAL;
>          }
>  
> +        l3_len = IP_HEADER_LEN;
>          *l4proto = ip_hdr->ip_proto;
>          dp_packet_hwol_set_tx_ipv4(b);
>      } else if (eth_type == htons(ETH_TYPE_IPV6)) {
> @@ -6536,10 +6541,25 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>              return -EINVAL;
>          }
>  
> +        l3_len = IPV6_HEADER_LEN;
>          *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
>          dp_packet_hwol_set_tx_ipv6(b);
>      }
>  
> +    dp_packet_hwol_set_l3_len(b, l3_len);
> +
> +    if (*l4proto == IPPROTO_TCP) {
> +        struct tcp_header *tcp_hdr =  dp_packet_at(b, l2_len + l3_len,
> +                                          sizeof(struct tcp_header));
> +
> +        if (!tcp_hdr) {
> +            return -EINVAL;
> +        }
> +
> +        l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
> +        dp_packet_hwol_set_l4_len(b, l4_len);
> +    }
> +
>      return 0;
>  }
>  
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 90962ee..b437caf 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -960,15 +960,21 @@ netdev_push_header(const struct netdev *netdev,
>      size_t i, size = dp_packet_batch_size(batch);
>  
>      DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
> -        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
> -                         || dp_packet_hwol_l4_mask(packet))) {
> +        if (OVS_UNLIKELY((dp_packet_hwol_is_tso(packet)
> +                          || dp_packet_hwol_l4_mask(packet))
> +                         && (data->tnl_type != OVS_VPORT_TYPE_VXLAN))) {
>              COVERAGE_INC(netdev_push_header_drops);
>              dp_packet_delete(packet);
> -            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
> -                         "not supported: packet dropped",
> +            VLOG_WARN_RL(&rl,
> +                         "%s: non-VxLAN Tunneling packets with HW offload "
> +                         "flags is not supported: packet dropped",
>                           netdev_get_name(netdev));
>          } else {
>              netdev->netdev_class->push_header(netdev, packet, data);
> +            if ((data->tnl_type == OVS_VPORT_TYPE_VXLAN)
> +                && dp_packet_hwol_is_tso(packet)) {
> +                dp_packet_hwol_set_vxlan_tcp_seg(packet);
> +            }
>              pkt_metadata_init(&packet->md, data->out_port);
>              dp_packet_batch_refill(batch, packet, i);
>          }
> -- 
> 2.7.4
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Yi Yang (杨燚) - Cloud Service Group July 13, 2020, 1:45 a.m. UTC | #2
Flavio, thank you so much for reviewing again. I'll fix these in the next version; detailed replies are inline. Sorry for the bad Outlook email format, I can only use Outlook.

-----Original Message-----
From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of Flavio Leitner
Sent: July 11, 2020 4:51 AM
To: yang_y_yi@163.com
Cc: ovs-dev@openvswitch.org
Subject: Re: [ovs-dev] [PATCH v2 2/5] Enable VXLAN TSO for DPDK datapath


Hi Yi,

This is not a full review, but netdev-dpdk.c is used by Windows and BSD as well, and there is a 'linux' function which seems to be a copy of another existing one. Perhaps we can use just one?

[Yi Yang] I have some changes in these functions. Do you mean we can invoke the functions in netdev-linux? How shall we handle the non-Linux cases?

This patch resets ol_flags from vhostuser, ignoring what has been set by rte_vhost_dequeue_burst(). What happens if a VM turns off offloading? Also, this is always enabled, while userspace offloading is experimental and defaults to off.

[Yi Yang] I didn't realize this; I will fix it in the next version.

Why do we need to set l2_len, l3_len and l4_len when receiving from the VM? Those are not used by OVS, and if the packet changes during pipeline execution, they will need to be updated in the appropriate prepare function, which for dpdk is netdev_dpdk_prep_hwol_packet().

[Yi Yang] Currently, netdev_dpdk_prep_hwol_packet assumes ol_flags and l*_len have been set correctly before it is called, and it needs them to make some decisions. But in netdev_dpdk_prep_hwol_batch, dev is the output device, not the input device, so we can't tell whether the packet is from a VM or from a physical dpdk port. Your concern makes sense; I will move this into netdev_dpdk_prep_hwol_packet. Can you propose a way to decide whether the packet is from a VM in netdev_dpdk_prep_hwol_packet?

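One idea, as a pure sketch (the 'rx_from_vhost' member is
hypothetical; struct dp_packet has no such field today): record the
origin where it is known, i.e. on the vhost rx side, and test it in
the prepare function:

    #include <stdbool.h>
    #include <rte_mbuf.h>

    /* Hypothetical origin marker; it would be set to true in
     * netdev_dpdk_vhost_rxq_recv() and checked in
     * netdev_dpdk_prep_hwol_packet(). */
    struct vm_tagged_packet {
        struct rte_mbuf mbuf;   /* must stay first, as in dp_packet */
        bool rx_from_vhost;
    };

    static bool
    packet_is_from_vm(const struct vm_tagged_packet *pkt)
    {
        return pkt->rx_from_vhost;
    }
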
A few more comments below.

Thanks!
fbl

On Wed, Jul 01, 2020 at 05:15:30PM +0800, yang_y_yi@163.com wrote:
> From: Yi Yang <yangyi01@inspur.com>
> 
> Many NICs support VXLAN TSO, which can help improve
> cross-compute-node VM-to-VM performance when the MTU is set to 1500.
> 
> This patch allows the dpdkvhostuserclient interface and veth/tap
> interfaces to leverage NICs' offload capability to maximize
> cross-compute-node TCP performance. With it applied, OVS DPDK can
> reach line speed for cross-compute-node VM-to-VM TCP traffic.
> 
> Signed-off-by: Yi Yang <yangyi01@inspur.com>
> ---
>  lib/dp-packet.h    |  61 +++++++++++++++++
>  lib/netdev-dpdk.c  | 193 
> +++++++++++++++++++++++++++++++++++++++++++++++++----
>  lib/netdev-linux.c |  20 ++++++
>  lib/netdev.c       |  14 ++--
>  4 files changed, 271 insertions(+), 17 deletions(-)
> 
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 070d111..07af124 
> 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -1034,6 +1034,67 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;  }
>  
> +#ifdef DPDK_NETDEV
> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */ static 
> +inline void dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b) {
> +    b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
> +    b->mbuf.l2_len += sizeof(struct udp_header) +
> +                      sizeof(struct vxlanhdr);
> +    b->mbuf.outer_l2_len = ETH_HEADER_LEN;
> +    b->mbuf.outer_l3_len = IP_HEADER_LEN;

What about IPv6?
[Yi Yang] The current DPDK GSO code can't handle IPv6 (I mean the case where the outer IP header is IPv6), and I'm not sure whether the NIC can handle it. So far only IPv4 has been verified; I will add check code here after I confirm that.

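The check will probably be along these lines (sketch, not in the patch
yet): only request VXLAN TSO when the pushed outer header is IPv4, and
otherwise leave the packet to the software path:

    #include "dp-packet.h"
    #include "packets.h"

    /* After push_header(), dp_packet_eth() points at the new outer
     * Ethernet header, so its ethertype gives the outer family. */
    static bool
    vxlan_tso_outer_supported(const struct dp_packet *packet)
    {
        const struct eth_header *eth = dp_packet_eth(packet);

        return eth && eth->eth_type == htons(ETH_TYPE_IP);
    }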

> +}
> +
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len) {
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len) {
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len) {
> +    b->mbuf.l4_len = l4_len;
> +}
> +#else
> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */ static 
> +inline void dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b 
> +OVS_UNUSED) { }
> +
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b OVS_UNUSED,
> +                          int l2_len OVS_UNUSED) { }
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b OVS_UNUSED,
> +                          int l3_len OVS_UNUSED) { }
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED,
> +                          int l4_len OVS_UNUSED) { } #endif /* 
> +DPDK_NETDEV */
> +
>  static inline bool
>  dp_packet_ip_checksum_valid(const struct dp_packet *p)  { diff --git 
> a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 44ebf96..bf5fa63 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -44,6 +44,7 @@
>  #include <rte_pci.h>
>  #include <rte_version.h>
>  #include <rte_vhost.h>
> +#include <rte_ip.h>

We have network header definitions in OVS and we should give preference to them.

[Yi Yang] But this is DPDK-only code, and some APIs such as rte_ipv4_frag_pkt_is_fragmented() need DPDK-style headers, so I think it is better to use the DPDK headers to match these APIs.

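For example (just to illustrate the point; this exact helper is not in
the patch):

    #include <rte_ip.h>
    #include <rte_ip_frag.h>

    /* DPDK helpers take the rte_* header types directly. */
    static int
    ipv4_pkt_is_fragmented(const struct rte_ipv4_hdr *ip_hdr)
    {
        return rte_ipv4_frag_pkt_is_fragmented(ip_hdr);
    }
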
>  #include "cmap.h"
>  #include "coverage.h"
> @@ -87,6 +88,7 @@ COVERAGE_DEFINE(vhost_notification);
>  
>  #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE  #define OVS_VPORT_DPDK 
> "ovs_dpdk"
> +#define DPDK_RTE_HDR_OFFSET 1

Perhaps name it HDR_NEXT_OFFSET and define it somewhere more generic, because it's neither RTE nor DPDK specific?
[Yi Yang] OK, maybe it is better to define it immediately before the function that uses it.

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 070d111..07af124 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -1034,6 +1034,67 @@  dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
     *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
 }
 
+#ifdef DPDK_NETDEV
+/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
+static inline void
+dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
+{
+    b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
+    b->mbuf.l2_len += sizeof(struct udp_header) +
+                      sizeof(struct vxlanhdr);
+    b->mbuf.outer_l2_len = ETH_HEADER_LEN;
+    b->mbuf.outer_l3_len = IP_HEADER_LEN;
+}
+
+/* Set l2_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
+{
+    b->mbuf.l2_len = l2_len;
+}
+
+/* Set l3_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
+{
+    b->mbuf.l3_len = l3_len;
+}
+
+/* Set l4_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
+{
+    b->mbuf.l4_len = l4_len;
+}
+#else
+/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
+static inline void
+dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b OVS_UNUSED)
+{
+}
+
+/* Set l2_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l2_len(struct dp_packet *b OVS_UNUSED,
+                          int l2_len OVS_UNUSED)
+{
+}
+
+/* Set l3_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l3_len(struct dp_packet *b OVS_UNUSED,
+                          int l3_len OVS_UNUSED)
+{
+}
+
+/* Set l4_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED,
+                          int l4_len OVS_UNUSED)
+{
+}
+#endif /* DPDK_NETDEV */
+
 static inline bool
 dp_packet_ip_checksum_valid(const struct dp_packet *p)
 {
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 44ebf96..bf5fa63 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -44,6 +44,7 @@ 
 #include <rte_pci.h>
 #include <rte_version.h>
 #include <rte_vhost.h>
+#include <rte_ip.h>
 
 #include "cmap.h"
 #include "coverage.h"
@@ -87,6 +88,7 @@  COVERAGE_DEFINE(vhost_notification);
 
 #define OVS_CACHE_LINE_SIZE CACHE_LINE_SIZE
 #define OVS_VPORT_DPDK "ovs_dpdk"
+#define DPDK_RTE_HDR_OFFSET 1
 
 /*
  * need to reserve tons of extra space in the mbufs so we can align the
@@ -405,6 +407,7 @@  enum dpdk_hw_ol_features {
     NETDEV_RX_HW_SCATTER = 1 << 2,
     NETDEV_TX_TSO_OFFLOAD = 1 << 3,
     NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
+    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
 };
 
 /*
@@ -988,6 +991,12 @@  dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
 
     if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
         conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
+        /* Enable VXLAN TSO support if available */
+        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
+        }
         if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
             conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
         }
@@ -1126,6 +1135,10 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev)
         if ((info.tx_offload_capa & tx_tso_offload_capa)
             == tx_tso_offload_capa) {
             dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
+            /* Enable VXLAN TSO support if available */
+            if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
+                dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+            }
             if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
                 dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
             } else {
@@ -2137,29 +2150,96 @@  static bool
 netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
 {
     struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
+    uint16_t l4_proto = 0;
+    struct rte_ether_hdr *eth_hdr =
+        rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
+    struct rte_ipv4_hdr *ip_hdr;
+    struct rte_ipv6_hdr *ip6_hdr;
+
+    if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
+        /* Handle VXLAN TSO */
+        struct rte_udp_hdr *udp_hdr;
+
+        if (mbuf->ol_flags & PKT_TX_IPV4) {
+            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
+            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + DPDK_RTE_HDR_OFFSET);
+
+            /* outer IP checksum offload */
+            ip_hdr->hdr_checksum = 0;
+            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
+            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
+
+            ip_hdr = (struct rte_ipv4_hdr *)
+                ((uint8_t *)udp_hdr + mbuf->l2_len);
+            l4_proto = ip_hdr->next_proto_id;
+
+            /* inner IP checksum offload */
+            ip_hdr->hdr_checksum = 0;
+            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
+            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
+            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + DPDK_RTE_HDR_OFFSET);
+
+            /* outer IP checksum offload */
+            ip_hdr->hdr_checksum = 0;
+            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
+            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
+
+            ip6_hdr = (struct rte_ipv6_hdr *)
+                ((uint8_t *)udp_hdr + mbuf->l2_len);
+            l4_proto = ip6_hdr->proto;
+
+            /* inner IP checksum offload */
+            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        }
+    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
+        /* Handle VLAN TSO */
+        /* No inner IP checksum for IPv6. */
+        if (mbuf->ol_flags & PKT_TX_IPV4) {
+            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
+            l4_proto = ip_hdr->next_proto_id;
+
+            /* IP checksum offload */
+            ip_hdr->hdr_checksum = 0;
+            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
+            ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
+            l4_proto = ip6_hdr->proto;
+
+            /* IP checksum offload */
+            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        }
 
-    if (mbuf->ol_flags & PKT_TX_L4_MASK) {
         mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
         mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
         mbuf->outer_l2_len = 0;
         mbuf->outer_l3_len = 0;
     }
 
-    if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
-        struct tcp_header *th = dp_packet_l4(pkt);
-
-        if (!th) {
+    if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
+        if (l4_proto != IPPROTO_UDP) {
+            VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
+                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
+            return false;
+        }
+        /* VXLAN GSO can be done here */
+    } else if (mbuf->ol_flags & PKT_TX_TCP_SEG ||
+               mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
+        if (l4_proto != IPPROTO_TCP) {
             VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
                          " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
             return false;
         }
 
-        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
-        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
-        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
+        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
+            dp_packet_hwol_set_tcp_seg(pkt);
+        }
 
-        if (mbuf->ol_flags & PKT_TX_IPV4) {
-            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
+        if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
+            mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
+        } else {
+            mbuf->tso_segsz = 0;
         }
     }
     return true;
@@ -2365,6 +2445,71 @@  netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
     }
 }
 
+static void
+netdev_linux_parse_l2(struct dp_packet *pkt, uint16_t *l4_proto)
+{
+    struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt;
+    struct rte_ether_hdr *eth_hdr =
+                rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
+    ovs_be16 eth_type;
+    int l2_len;
+    int l3_len = 0;
+    int l4_len = 0;
+
+    l2_len = ETH_HEADER_LEN;
+    eth_type = (OVS_FORCE ovs_be16) eth_hdr->ether_type;
+    if (eth_type_vlan(eth_type)) {
+        struct rte_vlan_hdr *vlan_hdr =
+                        (struct rte_vlan_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET);
+
+        eth_type = (OVS_FORCE ovs_be16) vlan_hdr->eth_proto;
+        l2_len += VLAN_HEADER_LEN;
+    }
+
+    dp_packet_hwol_set_l2_len(pkt, l2_len);
+
+    if (eth_type == htons(ETH_TYPE_IP)) {
+        struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
+            ((char *)eth_hdr + l2_len);
+
+        l3_len = IP_HEADER_LEN;
+        dp_packet_hwol_set_tx_ipv4(pkt);
+        *l4_proto = ipv4_hdr->next_proto_id;
+    } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) {
+        struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *)
+            ((char *)eth_hdr + l2_len);
+        l3_len = IPV6_HEADER_LEN;
+        dp_packet_hwol_set_tx_ipv6(pkt);
+        *l4_proto = ipv6_hdr->proto;
+    }
+
+    dp_packet_hwol_set_l3_len(pkt, l3_len);
+
+    if (*l4_proto == IPPROTO_TCP) {
+        struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *)
+            ((char *)eth_hdr + l2_len + l3_len);
+
+        l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+        dp_packet_hwol_set_l4_len(pkt, l4_len);
+    }
+}
+
+static void
+netdev_dpdk_parse_hdr(struct dp_packet *b)
+{
+    uint16_t l4_proto = 0;
+
+    netdev_linux_parse_l2(b, &l4_proto);
+
+    if (l4_proto == IPPROTO_TCP) {
+        dp_packet_hwol_set_csum_tcp(b);
+    } else if (l4_proto == IPPROTO_UDP) {
+        dp_packet_hwol_set_csum_udp(b);
+    } else if (l4_proto == IPPROTO_SCTP) {
+        dp_packet_hwol_set_csum_sctp(b);
+    }
+}
+
 /*
  * The receive path for the vhost port is the TX path out from guest.
  */
@@ -2378,6 +2523,7 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
     uint16_t qos_drops = 0;
     int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
     int vid = netdev_dpdk_get_vid(dev);
+    struct dp_packet *packet;
 
     if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
                      || !(dev->flags & NETDEV_UP))) {
@@ -2417,6 +2563,14 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
     batch->count = nb_rx;
     dp_packet_batch_init_packet_fields(batch);
 
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct rte_mbuf *mbuf = (struct rte_mbuf *)packet;
+
+        /* Clear ol_flags and set it by parsing header */
+        mbuf->ol_flags = 0;
+        netdev_dpdk_parse_hdr(packet);
+    }
+
     return 0;
 }
 
@@ -2737,13 +2891,18 @@  dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
 
     mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
     mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
-    mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
-                            ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
+    mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
+    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
+    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
+    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
+    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
+    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
 
     memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
            sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
 
-    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
+    if ((mbuf_dest->outer_l2_len == 0) &&
+        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
         mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
                                 - (char *)dp_packet_eth(pkt_dest);
         mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
@@ -2773,6 +2932,7 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
     uint32_t tx_failure = 0;
     uint32_t mtu_drops = 0;
     uint32_t qos_drops = 0;
+    struct rte_mbuf *mbuf;
 
     if (dev->type != DPDK_DEV_VHOST) {
         /* Check if QoS has been configured for this netdev. */
@@ -2801,6 +2961,9 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
             break;
         }
 
+        mbuf = (struct rte_mbuf *)pkts[txcnt];
+        netdev_dpdk_prep_hwol_packet(dev, mbuf);
+
         txcnt++;
     }
 
@@ -4949,6 +5112,10 @@  netdev_dpdk_reconfigure(struct netdev *netdev)
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
+        /* Enable VXLAN TSO support if available */
+        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+            netdev->ol_flags |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+        }
         if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
             netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
         }
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 6269c24..f6e80fc 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -6500,6 +6500,8 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
     struct eth_header *eth_hdr;
     ovs_be16 eth_type;
     int l2_len;
+    int l3_len = 0;
+    int l4_len = 0;
 
     eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN);
     if (!eth_hdr) {
@@ -6519,6 +6521,8 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
         l2_len += VLAN_HEADER_LEN;
     }
 
+    dp_packet_hwol_set_l2_len(b, l2_len);
+
     if (eth_type == htons(ETH_TYPE_IP)) {
         struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN);
 
@@ -6526,6 +6530,7 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
             return -EINVAL;
         }
 
+        l3_len = IP_HEADER_LEN;
         *l4proto = ip_hdr->ip_proto;
         dp_packet_hwol_set_tx_ipv4(b);
     } else if (eth_type == htons(ETH_TYPE_IPV6)) {
@@ -6536,10 +6541,25 @@  netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
             return -EINVAL;
         }
 
+        l3_len = IPV6_HEADER_LEN;
         *l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt;
         dp_packet_hwol_set_tx_ipv6(b);
     }
 
+    dp_packet_hwol_set_l3_len(b, l3_len);
+
+    if (*l4proto == IPPROTO_TCP) {
+        struct tcp_header *tcp_hdr =  dp_packet_at(b, l2_len + l3_len,
+                                          sizeof(struct tcp_header));
+
+        if (!tcp_hdr) {
+            return -EINVAL;
+        }
+
+        l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
+        dp_packet_hwol_set_l4_len(b, l4_len);
+    }
+
     return 0;
 }
 
diff --git a/lib/netdev.c b/lib/netdev.c
index 90962ee..b437caf 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -960,15 +960,21 @@  netdev_push_header(const struct netdev *netdev,
     size_t i, size = dp_packet_batch_size(batch);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
-                         || dp_packet_hwol_l4_mask(packet))) {
+        if (OVS_UNLIKELY((dp_packet_hwol_is_tso(packet)
+                          || dp_packet_hwol_l4_mask(packet))
+                         && (data->tnl_type != OVS_VPORT_TYPE_VXLAN))) {
             COVERAGE_INC(netdev_push_header_drops);
             dp_packet_delete(packet);
-            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
-                         "not supported: packet dropped",
+            VLOG_WARN_RL(&rl,
+                         "%s: non-VxLAN Tunneling packets with HW offload "
+                         "flags is not supported: packet dropped",
                          netdev_get_name(netdev));
         } else {
             netdev->netdev_class->push_header(netdev, packet, data);
+            if ((data->tnl_type == OVS_VPORT_TYPE_VXLAN)
+                && dp_packet_hwol_is_tso(packet)) {
+                dp_packet_hwol_set_vxlan_tcp_seg(packet);
+            }
             pkt_metadata_init(&packet->md, data->out_port);
             dp_packet_batch_refill(batch, packet, i);
         }