[ovs-dev,v1] Enable VXLAN TSO for DPDK datapath

Message ID 20200601101557.85072-1-yang_y_yi@163.com
State Superseded
Series [ovs-dev,v1] Enable VXLAN TSO for DPDK datapath

Commit Message

yang_y_yi June 1, 2020, 10:15 a.m. UTC
From: Yi Yang <yangyi01@inspur.com>

Many NICs support VXLAN TSO, which can help
improve cross-compute-node VM-to-VM performance
when the MTU is set to 1500.

This patch allows the dpdkvhostuserclient and
veth/tap interfaces to leverage NICs' offload
capability to maximize cross-compute-node TCP
performance; with it applied, OVS DPDK can reach
line speed for cross-compute-node VM-to-VM TCP
traffic.

GSO (for UDP and software VXLAN TSO) and GRO
will be added in the near future.

Signed-off-by: Yi Yang <yangyi01@inspur.com>
---
 lib/dp-packet.h   |  58 ++++++++++++++++
 lib/netdev-dpdk.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 lib/netdev.c      |  16 ++---
 3 files changed, 243 insertions(+), 24 deletions(-)

Comments

0-day Robot June 1, 2020, 10:28 p.m. UTC | #1
Bleep bloop.  Greetings yang_y_yi, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some errors that I wasn't expecting. See the details below.


checkpatch:
ERROR: Inappropriate bracing around statement
ERROR: C99 style comment
#215 FILE: lib/netdev-dpdk.c:2219:
    if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) { //UDP

ERROR: C99 style comment
#241 FILE: lib/netdev-dpdk.c:2240:
            //mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;

Lines checked: 424, Warnings: 0, Errors: 3
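
For reference, the flagged lines rewritten in the C89 comment style that
checkpatch expects (a sketch of the style fix only, not the final v2 code):

    if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) { /* UDP */

    /* mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len; */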


build:
 dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
                                             ^
lib/dp-packet.h:1076:52: error: unused parameter 'l2_len' [-Werror=unused-parameter]
 dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
                                                    ^
lib/dp-packet.h: In function 'dp_packet_hwol_set_l3_len':
lib/dp-packet.h:1082:45: error: unused parameter 'b' [-Werror=unused-parameter]
 dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
                                             ^
lib/dp-packet.h:1082:52: error: unused parameter 'l3_len' [-Werror=unused-parameter]
 dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
                                                    ^
lib/dp-packet.h: In function 'dp_packet_hwol_set_l4_len':
lib/dp-packet.h:1088:45: error: unused parameter 'b' [-Werror=unused-parameter]
 dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
                                             ^
lib/dp-packet.h:1088:52: error: unused parameter 'l4_len' [-Werror=unused-parameter]
 dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
                                                    ^
cc1: all warnings being treated as errors
make[2]: *** [lib/bfd.lo] Error 1
make[2]: Leaving directory `/var/lib/jenkins/jobs/0day_robot_upstream_build_from_pw/workspace'
make[1]: *** [all-recursive] Error 1
make[1]: Leaving directory `/var/lib/jenkins/jobs/0day_robot_upstream_build_from_pw/workspace'
make: *** [all] Error 2


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot
Stokes, Ian June 3, 2020, 12:43 p.m. UTC | #2
On 6/1/2020 11:15 AM, yang_y_yi@163.com wrote:
> From: Yi Yang <yangyi01@inspur.com>
> 
> Many NICs support VXLAN TSO, which can help
> improve cross-compute-node VM-to-VM performance
> when the MTU is set to 1500.
> 
> This patch allows the dpdkvhostuserclient and
> veth/tap interfaces to leverage NICs' offload
> capability to maximize cross-compute-node TCP
> performance; with it applied, OVS DPDK can reach
> line speed for cross-compute-node VM-to-VM TCP
> traffic.
> 
> GSO (for UDP and software VXLAN TSO) and GRO
> will be added in the near future.

Hi Yi, thanks for working on this. I have not tested it yet, as there were
multiple compilation issues to be addressed; I've flagged these below.


If you can address these, I'll meanwhile look to deploy a setup in
our lab to enable testing/review in greater detail.

> 
> Signed-off-by: Yi Yang <yangyi01@inspur.com>
> ---
>   lib/dp-packet.h   |  58 ++++++++++++++++
>   lib/netdev-dpdk.c | 193 ++++++++++++++++++++++++++++++++++++++++++++++++++----
>   lib/netdev.c      |  16 ++---
>   3 files changed, 243 insertions(+), 24 deletions(-)
> 
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 0430cca..7faa675 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -1032,6 +1032,64 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>       *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>   }
>   
> +#ifdef DPDK_NETDEV
> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
> +static inline void
> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
> +{
> +    b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
> +    b->mbuf.l2_len += sizeof(struct udp_header) +
> +                      sizeof(struct vxlanhdr);
> +    b->mbuf.outer_l2_len = ETH_HEADER_LEN;
> +    b->mbuf.outer_l3_len = IP_HEADER_LEN;
> +}
> +
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
> +{
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
> +{
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
> +{
> +    b->mbuf.l4_len = l4_len;
> +}
> +#else

For the non-DPDK case, OVS_UNUSED needs to be added after each function
parameter below; otherwise the parameters will be treated as unused and
cause a compilation failure for non-DPDK builds.
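
For illustration, a minimal sketch of one such stub with that fix applied
(OVS_UNUSED is the attribute macro OVS uses to mark parameters that are
intentionally unused):

    /* Set l2_len for the packet 'b' (no-op when DPDK is not compiled in). */
    static inline void
    dp_packet_hwol_set_l2_len(struct dp_packet *b OVS_UNUSED,
                              int l2_len OVS_UNUSED)
    {
    }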

> +/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
> +static inline void
> +dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
> +{
> +}
> +
> +/* Set l2_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
> +{
> +}
> +
> +/* Set l3_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
> +{
> +}
> +
> +/* Set l4_len for the packet 'b' */
> +static inline void
> +dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
> +{
> +}
> +#endif /* DPDK_NETDEV */
> +
>   static inline bool
>   dp_packet_ip_checksum_valid(const struct dp_packet *p)
>   {
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 44ebf96..c2424f7 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -44,6 +44,7 @@
>   #include <rte_pci.h>
>   #include <rte_version.h>
>   #include <rte_vhost.h>
> +#include <rte_ip.h>
>   
>   #include "cmap.h"
>   #include "coverage.h"
> @@ -405,6 +406,7 @@ enum dpdk_hw_ol_features {
>       NETDEV_RX_HW_SCATTER = 1 << 2,
>       NETDEV_TX_TSO_OFFLOAD = 1 << 3,
>       NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
> +    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
>   };
>   
>   /*
> @@ -988,6 +990,12 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>   
>       if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
>           conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
> +        /* Enable VXLAN TSO support if available */
> +        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +            conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
> +            conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +            conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
> +        }
>           if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
>               conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
>           }
> @@ -1126,6 +1134,10 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>           if ((info.tx_offload_capa & tx_tso_offload_capa)
>               == tx_tso_offload_capa) {
>               dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
> +            /* Enable VXLAN TSO support if available */
> +            if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +                dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +            }
>               if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
>                   dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
>               } else {
> @@ -2137,29 +2149,97 @@ static bool
>   netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>   {
>       struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
> +    uint16_t l4_proto = 0;
> +    struct rte_ether_hdr *eth_hdr =
> +        rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
> +    struct rte_ipv4_hdr *ip_hdr;
> +    struct rte_ipv6_hdr *ip6_hdr;
> +
> +    if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
> +        /* Handle VXLAN TSO */
> +        struct rte_udp_hdr *udp_hdr;
> +
> +        if (mbuf->ol_flags & PKT_TX_IPV4) {
We try to avoid the use of magic numbers where possible; it might be worth
defining the "+ 1" separately where it's used below and elsewhere.
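
One way to do that (dpdk_eth_payload() is a hypothetical helper name used
only for illustration, assuming DPDK's RTE_ETHER_HDR_LEN):

    /* Step past the Ethernet header by its defined length instead of
     * relying on bare "eth_hdr + 1" pointer arithmetic. */
    static inline void *
    dpdk_eth_payload(struct rte_ether_hdr *eth_hdr)
    {
        return (char *) eth_hdr + RTE_ETHER_HDR_LEN;
    }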

> +            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
> +
> +            /* outer IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
> +
> +            ip_hdr = (struct rte_ipv4_hdr *)
> +                ((uint8_t *)udp_hdr + mbuf->l2_len);
> +            l4_proto = ip_hdr->next_proto_id;
> +
> +            /* inner IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
> +            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
> +
> +            /* outer IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
> +            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
> +
> +            ip6_hdr = (struct rte_ipv6_hdr *)
> +                ((uint8_t *)udp_hdr + mbuf->l2_len);
> +            l4_proto = ip6_hdr->proto;
> +
> +            /* inner IP checksum offload offload */
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        }
> +    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> +        /* Handle VLAN TSO */
> +            /* no inner IP checksum for IPV6 */
> +        if (mbuf->ol_flags & PKT_TX_IPV4) {
> +            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
> +            l4_proto = ip_hdr->next_proto_id;
> +
> +            /* IP checksum offload */
> +            ip_hdr->hdr_checksum = 0;
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
> +            ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
> +            l4_proto = ip6_hdr->proto;
> +
> +            /* IP checksum offload */
> +            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        }
>   
> -    if (mbuf->ol_flags & PKT_TX_L4_MASK) {
>           mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
>           mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
>           mbuf->outer_l2_len = 0;
>           mbuf->outer_l3_len = 0;
>       }
>   
> -    if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
> -        struct tcp_header *th = dp_packet_l4(pkt);
> -
> -        if (!th) {
> +    if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) { //UDP
> +        if (l4_proto != IPPROTO_UDP) {
> +            VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
> +                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +            return false;
> +        }
> +        /* VXLAN GSO can be done here */
> +    } else if (mbuf->ol_flags & PKT_TX_TCP_SEG ||
> +               mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
> +        if (l4_proto != IPPROTO_TCP) {
>               VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
>                            " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
>               return false;
>           }
>   
> -        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> -        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
> -        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
> +            dp_packet_hwol_set_tcp_seg(pkt);
> +        }
>   
> -        if (mbuf->ol_flags & PKT_TX_IPV4) {
> -            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
> +        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
> +        if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
> +            //mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +            mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
> +        } else {
> +            mbuf->tso_segsz = 0;
>           }
>       }
>       return true;
> @@ -2365,6 +2445,71 @@ netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
>       }
>   }
>   
> +static void
> +netdev_linux_parse_l2(struct dp_packet *pkt, uint16_t *l4_proto)
> +{
> +    struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt;
> +    struct rte_ether_hdr *eth_hdr =
> +                rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
> +    ovs_be16 eth_type;
> +    int l2_len;
> +    int l3_len = 0;
> +    int l4_len = 0;
> +
> +    l2_len = ETH_HEADER_LEN;
This will cause a compilation issue below: it mixes base types, as the
assignment expects a restricted ovs_be16 but is given an unsigned short.
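
A sketch of one way to keep the types consistent, assuming DPDK's
rte_be_to_cpu_16() from rte_byteorder.h:

    /* Round-trip DPDK's plain big-endian uint16_t through host order so
     * the result carries OVS's ovs_be16 annotation. */
    ovs_be16 eth_type = htons(rte_be_to_cpu_16(eth_hdr->ether_type));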

> +    eth_type = eth_hdr->ether_type;
> +    if (eth_type_vlan(eth_type)) {
> +        struct rte_vlan_hdr *vlan_hdr =
> +                        (struct rte_vlan_hdr *)(eth_hdr + 1);
> +
Same issue as flagged above: unexpected type.
> +        eth_type = vlan_hdr->eth_proto;
> +        l2_len += VLAN_HEADER_LEN;
> +    }
> +
> +    dp_packet_hwol_set_l2_len(pkt, l2_len);
> +
> +    if (eth_type == htons(ETH_TYPE_IP)) {
> +        struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
> +            ((char *)eth_hdr + l2_len);
> +
> +        l3_len = IP_HEADER_LEN;
> +        dp_packet_hwol_set_tx_ipv4(pkt);
> +        *l4_proto = ipv4_hdr->next_proto_id;
> +    } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) {
> +        struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *)
> +            ((char *)eth_hdr + l2_len);
> +        l3_len = IPV6_HEADER_LEN;
> +        dp_packet_hwol_set_tx_ipv6(pkt);
> +        *l4_proto = ipv6_hdr->proto;
> +    }
> +
> +    dp_packet_hwol_set_l3_len(pkt, l3_len);
> +
> +    if (*l4_proto == IPPROTO_TCP) {
> +        struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *)
> +            ((char *)eth_hdr + l2_len + l3_len);
> +
> +        l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
> +        dp_packet_hwol_set_l4_len(pkt, l4_len);
> +    }
> +}
> +
> +static void
> +netdev_dpdk_parse_hdr(struct dp_packet *b)
> +{
> +    uint16_t l4_proto = 0;
> +
> +    netdev_linux_parse_l2(b, &l4_proto);
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        dp_packet_hwol_set_csum_tcp(b);
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        dp_packet_hwol_set_csum_udp(b);
> +    } else if (l4_proto == IPPROTO_SCTP) {
> +        dp_packet_hwol_set_csum_sctp(b);
> +    }
> +}
> +
>   /*
>    * The receive path for the vhost port is the TX path out from guest.
>    */
> @@ -2378,6 +2523,7 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>       uint16_t qos_drops = 0;
>       int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
>       int vid = netdev_dpdk_get_vid(dev);
> +    struct dp_packet *packet;
>   
>       if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
>                        || !(dev->flags & NETDEV_UP))) {
> @@ -2417,6 +2563,14 @@ netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
>       batch->count = nb_rx;
>       dp_packet_batch_init_packet_fields(batch);
>   
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct rte_mbuf *mbuf = (struct rte_mbuf *)packet;
> +
> +        /* Clear ol_flags and set it by parsing header */
> +        mbuf->ol_flags = 0;
> +        netdev_dpdk_parse_hdr(packet);
> +    }
> +
>       return 0;
>   }
>   
> @@ -2737,13 +2891,18 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
>   
>       mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
>       mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
> -    mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
> -                            ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
> +    mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
> +    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
> +    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
> +    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
> +    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
> +    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
>   
>       memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
>              sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
>   
> -    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
> +    if ((mbuf_dest->outer_l2_len == 0) &&
> +        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
>           mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
>                                   - (char *)dp_packet_eth(pkt_dest);
>           mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
> @@ -2773,6 +2932,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>       uint32_t tx_failure = 0;
>       uint32_t mtu_drops = 0;
>       uint32_t qos_drops = 0;
> +    struct rte_mbuf *mbuf;
>   
>       if (dev->type != DPDK_DEV_VHOST) {
>           /* Check if QoS has been configured for this netdev. */
> @@ -2801,6 +2961,9 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
>               break;
>           }
>   
> +        mbuf = (struct rte_mbuf *)pkts[txcnt];
> +        netdev_dpdk_prep_hwol_packet(dev, mbuf);
> +
>           txcnt++;
>       }
>   
> @@ -4949,6 +5112,10 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>           netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
>           netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
>           netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> +        /* Enable VXLAN TSO support if available */
> +        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +            netdev->ol_flags |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +        }
>           if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
>               netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
>           }
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 90962ee..378f04f 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -960,18 +960,12 @@ netdev_push_header(const struct netdev *netdev,
>       size_t i, size = dp_packet_batch_size(batch);
>   
>       DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
> -        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
> -                         || dp_packet_hwol_l4_mask(packet))) {
Removing the line below will break coverage and cause a compilation
failure. If coverage for the function is unused then you need to either
remove the existing COVERAGE_DEFINE(netdev_push_header_drops); or else
define a new one for use later in the function (see the sketch after the
quoted hunk).
> -            COVERAGE_INC(netdev_push_header_drops);
> -            dp_packet_delete(packet);
> -            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
> -                         "not supported: packet dropped",
> -                         netdev_get_name(netdev));
> -        } else {
> -            netdev->netdev_class->push_header(netdev, packet, data);
> -            pkt_metadata_init(&packet->md, data->out_port);
> -            dp_packet_batch_refill(batch, packet, i);
> +        netdev->netdev_class->push_header(netdev, packet, data);
> +        if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
> +            dp_packet_hwol_set_vxlan_tcp_seg(packet);
>           }
> +        pkt_metadata_init(&packet->md, data->out_port);
> +        dp_packet_batch_refill(batch, packet, i);
>       }
>   
>       return 0;
> 
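
A minimal sketch of the coverage pairing described above, using the
COVERAGE_DEFINE()/COVERAGE_INC() macros from lib/coverage.h:

    #include "coverage.h"
    #include "dp-packet.h"

    COVERAGE_DEFINE(netdev_push_header_drops);

    /* Each COVERAGE_DEFINE needs at least one COVERAGE_INC user left in
     * the file, or the definition should be removed along with it. */
    static void
    drop_unsupported_packet(struct dp_packet *packet)
    {
        COVERAGE_INC(netdev_push_header_drops);
        dp_packet_delete(packet);
    }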

It would be worth updating the documentation at some stage in the future
as part of this patch, as the limitations section directly calls out VXLAN
as not being supported.

http://docs.openvswitch.org/en/latest/topics/userspace-tso/#limitations

BR
Ian
Yi Yang (杨燚), Cloud Service Group June 4, 2020, 12:29 a.m. UTC | #3
Ian, thank you so much for the comments. I will run Travis to find out all the similar issues and fix them in v2. For the setup, I have sent you an email which includes the DPDK bug link; please use that to set it up, and let me know if you have any issues.

Yi Yang (杨燚), Cloud Service Group June 5, 2020, 5:36 a.m. UTC | #4
Hi, Ian

I have fixed the issues your comments mentioned and passed the Travis build check; the result is in my GitHub repo, https://github.com/yyang13/ovs. The v1 sent out last week missed a change in lib/netdev-linux.c, so please use the patch from https://github.com/yyang13/ovs for your setup.


Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 0430cca..7faa675 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -1032,6 +1032,64 @@  dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
     *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
 }
 
+#ifdef DPDK_NETDEV
+/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
+static inline void
+dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b)
+{
+    b->mbuf.ol_flags |= PKT_TX_TUNNEL_VXLAN;
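+    /* For tunnel TSO, l2_len covers the outer UDP and VXLAN headers
+     * as well as the inner Ethernet header. */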
+    b->mbuf.l2_len += sizeof(struct udp_header) +
+                      sizeof(struct vxlanhdr);
+    b->mbuf.outer_l2_len = ETH_HEADER_LEN;
+    b->mbuf.outer_l3_len = IP_HEADER_LEN;
+}
+
+/* Set l2_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l2_len(struct dp_packet *b, int l2_len)
+{
+    b->mbuf.l2_len = l2_len;
+}
+
+/* Set l3_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l3_len(struct dp_packet *b, int l3_len)
+{
+    b->mbuf.l3_len = l3_len;
+}
+
+/* Set l4_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len)
+{
+    b->mbuf.l4_len = l4_len;
+}
+#else
+/* Mark packet 'b' for VXLAN TCP segmentation offloading. */
+static inline void
+dp_packet_hwol_set_vxlan_tcp_seg(struct dp_packet *b OVS_UNUSED)
+{
+}
+
+/* Set l2_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l2_len(struct dp_packet *b OVS_UNUSED,
+                          int l2_len OVS_UNUSED)
+{
+}
+
+/* Set l3_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l3_len(struct dp_packet *b OVS_UNUSED,
+                          int l3_len OVS_UNUSED)
+{
+}
+
+/* Set l4_len for the packet 'b' */
+static inline void
+dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED,
+                          int l4_len OVS_UNUSED)
+{
+}
+#endif /* DPDK_NETDEV */
+
 static inline bool
 dp_packet_ip_checksum_valid(const struct dp_packet *p)
 {
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 44ebf96..c2424f7 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -44,6 +44,7 @@ 
 #include <rte_pci.h>
 #include <rte_version.h>
 #include <rte_vhost.h>
+#include <rte_ip.h>
 
 #include "cmap.h"
 #include "coverage.h"
@@ -405,6 +406,7 @@  enum dpdk_hw_ol_features {
     NETDEV_RX_HW_SCATTER = 1 << 2,
     NETDEV_TX_TSO_OFFLOAD = 1 << 3,
     NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
+    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 5,
 };
 
 /*
@@ -988,6 +990,12 @@  dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
 
     if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
         conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
+        /* Enable VXLAN TSO support if available */
+        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+            conf.txmode.offloads |= DEV_TX_OFFLOAD_MULTI_SEGS;
+        }
         if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
             conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM;
         }
@@ -1126,6 +1134,10 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev)
         if ((info.tx_offload_capa & tx_tso_offload_capa)
             == tx_tso_offload_capa) {
             dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD;
+            /* Enable VXLAN TSO support if available */
+            if (info.tx_offload_capa & DEV_TX_OFFLOAD_VXLAN_TNL_TSO) {
+                dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+            }
             if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) {
                 dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD;
             } else {
@@ -2137,29 +2149,97 @@  static bool
 netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
 {
     struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
+    uint16_t l4_proto = 0;
+    struct rte_ether_hdr *eth_hdr =
+        rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
+    struct rte_ipv4_hdr *ip_hdr;
+    struct rte_ipv6_hdr *ip6_hdr;
+
+    if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) {
+        /* Handle VXLAN TSO */
+        struct rte_udp_hdr *udp_hdr;
+
+        if (mbuf->ol_flags & PKT_TX_IPV4) {
+            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
+
+            /* outer IP checksum offload */
+            ip_hdr->hdr_checksum = 0;
+            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
+            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
+
+            ip_hdr = (struct rte_ipv4_hdr *)
+                ((uint8_t *)udp_hdr + mbuf->l2_len);
+            l4_proto = ip_hdr->next_proto_id;
+
+            /* inner IP checksum offload */
+            ip_hdr->hdr_checksum = 0;
+            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
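+            /* The outer header is still IPv4 here; PKT_TX_IPV6 refers
+             * to the inner packet. */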
+            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+            udp_hdr = (struct rte_udp_hdr *)(ip_hdr + 1);
+
+            /* outer IP checksum offload */
+            ip_hdr->hdr_checksum = 0;
+            mbuf->ol_flags |= PKT_TX_OUTER_IP_CKSUM;
+            mbuf->ol_flags |= PKT_TX_OUTER_IPV4;
+
+            ip6_hdr = (struct rte_ipv6_hdr *)
+                ((uint8_t *)udp_hdr + mbuf->l2_len);
+            l4_proto = ip6_hdr->proto;
+
+            /* inner IP checksum offload */
+            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        }
+    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
+        /* Handle VLAN TSO. */
+        /* Note: no inner IP checksum for IPv6. */
+        if (mbuf->ol_flags & PKT_TX_IPV4) {
+            ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
+            l4_proto = ip_hdr->next_proto_id;
+
+            /* IP checksum offload */
+            ip_hdr->hdr_checksum = 0;
+            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        } else if (mbuf->ol_flags & PKT_TX_IPV6) {
+            ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1);
+            l4_proto = ip6_hdr->proto;
+
+            /* IP checksum offload */
+            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        }
 
-    if (mbuf->ol_flags & PKT_TX_L4_MASK) {
         mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char *)dp_packet_eth(pkt);
         mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
         mbuf->outer_l2_len = 0;
         mbuf->outer_l3_len = 0;
     }
 
-    if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
-        struct tcp_header *th = dp_packet_l4(pkt);
-
-        if (!th) {
+    if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) { /* UDP */
+        if (l4_proto != IPPROTO_UDP) {
+            VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
+                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
+            return false;
+        }
+        /* VXLAN GSO can be done here */
+    } else if (mbuf->ol_flags & PKT_TX_TCP_SEG ||
+               mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
+        if (l4_proto != IPPROTO_TCP) {
             VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
                          " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
             return false;
         }
 
-        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
-        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
-        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
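+        /* Segment packets whose length beyond the L2 header exceeds
+         * 1450 bytes; see the tso_segsz calculation below. */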
+        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
+            dp_packet_hwol_set_tcp_seg(pkt);
+        }
 
-        if (mbuf->ol_flags & PKT_TX_IPV4) {
-            mbuf->ol_flags |= PKT_TX_IP_CKSUM;
+        mbuf->ol_flags |= PKT_TX_TCP_CKSUM;
+        if (mbuf->ol_flags & PKT_TX_TCP_SEG) {
+            /* Use 1450 rather than dev->mtu so that each segment leaves
+             * room for the VXLAN encapsulation (50 bytes of outer
+             * headers within a 1500-byte MTU). */
+            mbuf->tso_segsz = 1450 - mbuf->l3_len - mbuf->l4_len;
+        } else {
+            mbuf->tso_segsz = 0;
         }
     }
     return true;
@@ -2365,6 +2445,71 @@  netdev_dpdk_vhost_update_rx_counters(struct netdev_dpdk *dev,
     }
 }
 
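+/* Parse the L2/L3 headers of 'pkt', record the header lengths in its
+ * mbuf and return the L4 protocol number in '*l4_proto'. */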
+static void
+netdev_linux_parse_l2(struct dp_packet *pkt, uint16_t *l4_proto)
+{
+    struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt;
+    struct rte_ether_hdr *eth_hdr =
+                rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
+    ovs_be16 eth_type;
+    int l2_len;
+    int l3_len = 0;
+    int l4_len = 0;
+
+    l2_len = ETH_HEADER_LEN;
+    eth_type = eth_hdr->ether_type;
+    if (eth_type_vlan(eth_type)) {
+        struct rte_vlan_hdr *vlan_hdr =
+                        (struct rte_vlan_hdr *)(eth_hdr + 1);
+
+        eth_type = vlan_hdr->eth_proto;
+        l2_len += VLAN_HEADER_LEN;
+    }
+
+    dp_packet_hwol_set_l2_len(pkt, l2_len);
+
+    if (eth_type == htons(ETH_TYPE_IP)) {
+        struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
+            ((char *)eth_hdr + l2_len);
+
+        l3_len = IP_HEADER_LEN;
+        dp_packet_hwol_set_tx_ipv4(pkt);
+        *l4_proto = ipv4_hdr->next_proto_id;
+    } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) {
+        struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *)
+            ((char *)eth_hdr + l2_len);
+        l3_len = IPV6_HEADER_LEN;
+        dp_packet_hwol_set_tx_ipv6(pkt);
+        *l4_proto = ipv6_hdr->proto;
+    }
+
+    dp_packet_hwol_set_l3_len(pkt, l3_len);
+
+    if (*l4_proto == IPPROTO_TCP) {
+        struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *)
+            ((char *)eth_hdr + l2_len + l3_len);
+
+        l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
+        dp_packet_hwol_set_l4_len(pkt, l4_len);
+    }
+}
+
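+/* Parse the headers of 'b' and mark it for the L4 checksum offload
+ * that matches its protocol. */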
+static void
+netdev_dpdk_parse_hdr(struct dp_packet *b)
+{
+    uint16_t l4_proto = 0;
+
+    netdev_linux_parse_l2(b, &l4_proto);
+
+    if (l4_proto == IPPROTO_TCP) {
+        dp_packet_hwol_set_csum_tcp(b);
+    } else if (l4_proto == IPPROTO_UDP) {
+        dp_packet_hwol_set_csum_udp(b);
+    } else if (l4_proto == IPPROTO_SCTP) {
+        dp_packet_hwol_set_csum_sctp(b);
+    }
+}
+
 /*
  * The receive path for the vhost port is the TX path out from guest.
  */
@@ -2378,6 +2523,7 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
     uint16_t qos_drops = 0;
     int qid = rxq->queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
     int vid = netdev_dpdk_get_vid(dev);
+    struct dp_packet *packet;
 
     if (OVS_UNLIKELY(vid < 0 || !dev->vhost_reconfigured
                      || !(dev->flags & NETDEV_UP))) {
@@ -2417,6 +2563,14 @@  netdev_dpdk_vhost_rxq_recv(struct netdev_rxq *rxq,
     batch->count = nb_rx;
     dp_packet_batch_init_packet_fields(batch);
 
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct rte_mbuf *mbuf = (struct rte_mbuf *)packet;
+
+        /* Clear ol_flags and set it by parsing header */
+        mbuf->ol_flags = 0;
+        netdev_dpdk_parse_hdr(packet);
+    }
+
     return 0;
 }
 
@@ -2737,13 +2891,18 @@  dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, struct dp_packet *pkt_orig)
 
     mbuf_dest->tx_offload = pkt_orig->mbuf.tx_offload;
     mbuf_dest->packet_type = pkt_orig->mbuf.packet_type;
-    mbuf_dest->ol_flags |= (pkt_orig->mbuf.ol_flags &
-                            ~(EXT_ATTACHED_MBUF | IND_ATTACHED_MBUF));
+    mbuf_dest->ol_flags |= pkt_orig->mbuf.ol_flags;
+    mbuf_dest->l2_len = pkt_orig->mbuf.l2_len;
+    mbuf_dest->l3_len = pkt_orig->mbuf.l3_len;
+    mbuf_dest->l4_len = pkt_orig->mbuf.l4_len;
+    mbuf_dest->outer_l2_len = pkt_orig->mbuf.outer_l2_len;
+    mbuf_dest->outer_l3_len = pkt_orig->mbuf.outer_l3_len;
 
     memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
            sizeof(struct dp_packet) - offsetof(struct dp_packet, l2_pad_size));
 
-    if (mbuf_dest->ol_flags & PKT_TX_L4_MASK) {
+    if ((mbuf_dest->outer_l2_len == 0) &&
+        (mbuf_dest->ol_flags & PKT_TX_L4_MASK)) {
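+        /* Recompute inner offsets for non-tunneled packets; tunneled
+         * packets (outer_l2_len != 0) keep the offsets set at
+         * encapsulation time. */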
         mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
                                 - (char *)dp_packet_eth(pkt_dest);
         mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
@@ -2773,6 +2932,7 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
     uint32_t tx_failure = 0;
     uint32_t mtu_drops = 0;
     uint32_t qos_drops = 0;
+    struct rte_mbuf *mbuf;
 
     if (dev->type != DPDK_DEV_VHOST) {
         /* Check if QoS has been configured for this netdev. */
@@ -2801,6 +2961,9 @@  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet_batch *batch)
             break;
         }
 
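+        /* Fill in checksum/TSO offload metadata before transmission. */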
+        mbuf = (struct rte_mbuf *)pkts[txcnt];
+        netdev_dpdk_prep_hwol_packet(dev, mbuf);
+
         txcnt++;
     }
 
@@ -4949,6 +5112,10 @@  netdev_dpdk_reconfigure(struct netdev *netdev)
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM;
         netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
+        /* Enable VXLAN TSO support if available */
+        if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+            netdev->ol_flags |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+        }
         if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
             netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM;
         }
diff --git a/lib/netdev.c b/lib/netdev.c
index 90962ee..378f04f 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -960,18 +960,12 @@  netdev_push_header(const struct netdev *netdev,
     size_t i, size = dp_packet_batch_size(batch);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet)
-                         || dp_packet_hwol_l4_mask(packet))) {
-            COVERAGE_INC(netdev_push_header_drops);
-            dp_packet_delete(packet);
-            VLOG_WARN_RL(&rl, "%s: Tunneling packets with HW offload flags is "
-                         "not supported: packet dropped",
-                         netdev_get_name(netdev));
-        } else {
-            netdev->netdev_class->push_header(netdev, packet, data);
-            pkt_metadata_init(&packet->md, data->out_port);
-            dp_packet_batch_refill(batch, packet, i);
+        netdev->netdev_class->push_header(netdev, packet, data);
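+        /* Mark VXLAN-encapsulated packets so that the DPDK datapath
+         * can set up tunnel TSO at transmit time. */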
+        if (data->tnl_type == OVS_VPORT_TYPE_VXLAN) {
+            dp_packet_hwol_set_vxlan_tcp_seg(packet);
         }
+        pkt_metadata_init(&packet->md, data->out_port);
+        dp_packet_batch_refill(batch, packet, i);
     }
 
     return 0;