diff mbox series

[ovs-dev,v3] netdev-dpdk: Add vxlan and geneve tunnel tso.

Message ID 20230925013817.171-1-dexia.li@jaguarmicro.com
State Superseded
Headers show
Series [ovs-dev,v3] netdev-dpdk: Add vxlan and geneve tunnel tso. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Dexia Li Sept. 25, 2023, 1:38 a.m. UTC
For dpdk datapath, this patch provides vxlan and geneve tunnel tso.
Only support userspace vxlan or geneve tunnel, meanwhile support
tunnel outter and inner csum offload. if netdev do not support offload
features, there is a software fallback.

Signed-off-by: Dexia Li <dexia.li@jaguarmicro.com>
---
 lib/dp-packet.h         | 165 ++++++++++++++++++++++++++++++++++++----
 lib/netdev-dpdk.c       | 100 +++++++++++++++++++++---
 lib/netdev-native-tnl.c | 106 +++++++++++++++++++++++---
 lib/netdev-native-tnl.h |   5 +-
 lib/netdev-provider.h   |   4 +
 lib/netdev.c            |  34 ++++++---
 tests/dpif-netdev.at    |   4 +-
 7 files changed, 369 insertions(+), 49 deletions(-)

Comments

David Marchand Sept. 25, 2023, 10:15 a.m. UTC | #1
Hello,

Quick (unfinished) comments, I will try to have a better look in the
next weeks (but I am busy with 23.11).


On Mon, Sep 25, 2023 at 3:38 AM Dexia Li <dexia.li@jaguarmicro.com> wrote:
>
> For dpdk datapath, this patch provides vxlan and geneve tunnel tso.

I suppose you are talking about the userspace datapath, using DPDK support.
There is no dpdk datapath.


> Only support userspace vxlan or geneve tunnel, meanwhile support
> tunnel outter and inner csum offload. if netdev do not support offload
> features, there is a software fallback.

Can you provide performance numbers of the scenario you tested?
How does this patch impact performance with simple traffic?
Did you check L3 checksum, L4 checksum still work without tunnels?


>
> Signed-off-by: Dexia Li <dexia.li@jaguarmicro.com>
> ---
>  lib/dp-packet.h         | 165 ++++++++++++++++++++++++++++++++++++----
>  lib/netdev-dpdk.c       | 100 +++++++++++++++++++++---
>  lib/netdev-native-tnl.c | 106 +++++++++++++++++++++++---
>  lib/netdev-native-tnl.h |   5 +-
>  lib/netdev-provider.h   |   4 +
>  lib/netdev.c            |  34 ++++++---
>  tests/dpif-netdev.at    |   4 +-
>  7 files changed, 369 insertions(+), 49 deletions(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 70ddf8aa4..4b5883dba 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -86,22 +86,43 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800),
>      /* Offload IP checksum. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000),
> +    /* Offload packet is tunnel GENEVE. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_GENEVE,
> +                RTE_MBUF_F_TX_TUNNEL_GENEVE, 0x2000),
> +    /* Offload packet is tunnel VXLAN. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN,
> +                RTE_MBUF_F_TX_TUNNEL_VXLAN, 0x4000),
> +    /* Offload tunnel packet, out is ipv4 */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4,
> +                RTE_MBUF_F_TX_OUTER_IPV4, 0x8000),
> +    /* Offload TUNNEL out ipv4 checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_IP_CKSUM, 0x10000),
> +    /* Offload TUNNEL out udp checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_UDP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_UDP_CKSUM, 0x20000),
> +
>      /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
>  };
>
> -#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH         | \
> -                                     DP_PACKET_OL_FLOW_MARK        | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_TX_TCP_SEG       | \
> -                                     DP_PACKET_OL_TX_IPV4          | \
> -                                     DP_PACKET_OL_TX_IPV6          | \
> -                                     DP_PACKET_OL_TX_TCP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> -                                     DP_PACKET_OL_TX_IP_CKSUM)
> +#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH           | \
> +                                     DP_PACKET_OL_FLOW_MARK          | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_TX_TCP_SEG         | \
> +                                     DP_PACKET_OL_TX_IPV4            | \
> +                                     DP_PACKET_OL_TX_IPV6            | \
> +                                     DP_PACKET_OL_TX_TCP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_UDP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM      | \
> +                                     DP_PACKET_OL_TX_IP_CKSUM        | \
> +                                     DP_PACKET_OL_TX_TUNNEL_GENEVE   | \
> +                                     DP_PACKET_OL_TX_TUNNEL_VXLAN    | \
> +                                     DP_PACKET_OL_TX_OUTER_IPV4      | \
> +                                     DP_PACKET_OL_TX_OUTER_IP_CKSUM  | \
> +                                     DP_PACKET_OL_TX_OUTER_UDP_CKSUM)
>
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \
> @@ -535,6 +556,25 @@ dp_packet_get_nd_payload(const struct dp_packet *b)
>  }
>
>  #ifdef DPDK_NETDEV
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b, size_t l2_len)
> +{
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b, size_t l3_len)
> +{
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b, size_t l4_len)
> +{
> +    b->mbuf.l4_len = l4_len;
> +}
> +
> +

Exposing DPDK specific fields looks incorrect to me.
This should be abstracted in some way so that OVS "generic" code is
unaware of what runs underneath.


>  static inline uint64_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)
>  {
> @@ -554,6 +594,24 @@ dp_packet_flow_mark_ptr(const struct dp_packet *b)
>  }
>
>  #else
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b OVS_UNUSED, size_t l2_len OVS_UNUSED)
> +{
> +    /* There are no implementation */
> +}
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b OVS_UNUSED, size_t l3_len OVS_UNUSED)
> +{
> +    /* There are no implementation */
> +}
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b OVS_UNUSED, size_t l4_len OVS_UNUSED)
> +{
> +    /* There are no implementation */
> +}
> +

What happens if you run this code without DPDK?
Note that those empty stubs would not be needed with a proper abstraction.


>  static inline uint32_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)
>  {
> @@ -615,9 +673,10 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v)
>       * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
>       * loss of accuracy in assigning 'v' to 'data_len'.
>       */
> -    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
> -    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
> -                                      * this segment. */
> +    /* Current seg length. */
> +    b->mbuf.data_len += (uint16_t)(v - b->mbuf.pkt_len);

What is this change for?


> +    /* Total length of all segments linked to this segment. */
> +    b->mbuf.pkt_len = v;
>  }
>
>  static inline uint16_t
> @@ -1030,6 +1089,43 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b)
>              DP_PACKET_OL_TX_SCTP_CKSUM;
>  }
>
> +/* Returns 'true' if packet 'b' is marked for tunnel GENEVE
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_geneve(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_GENEVE);
> +}
> +
> +/* Returns 'true' if packet 'b' is marked for tunnel VXLAN
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_VXLAN);
> +}
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4. */
> +static inline bool
> +dp_packet_hwol_is_outer_ipv4(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IPV4);
> +}
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4 csum offload. */
> +static inline bool
> +dp_packet_hwol_is_outer_ipv4_cksum(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IP_CKSUM);
> +}
> +
> +/* Returns 'true' if packet 'b' is marked for out udp csum offload. */
> +static inline bool
> +dp_packet_hwol_is_outer_UDP_cksum(struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_UDP_CKSUM);
> +}
> +
>  static inline void
>  dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p)
>  {
> @@ -1105,6 +1201,43 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>  }
>
> +/* Mark packet 'b' for tunnel geneve offloading.  It implies that
> + * the packet 'b' is marked for tunnel geneve offloading. */
> +static inline void
> +dp_packet_hwol_set_tunnel_geneve(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_GENEVE;
> +}
> +
> +/* Mark packet 'b' for tunnel vxlan offloading.  It implies that
> + * the packet 'b' is marked for tunnel vxlan offloading. */
> +static inline void
> +dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN;
> +}
> +
> +/* Mark packet 'b' for out ipv4 packet. */
> +static inline void
> +dp_packet_hwol_set_outer_ipv4(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4;
> +}
> +
> +/* Mark packet 'b' for out ipv4 csum offloading. */
> +static inline void
> +dp_packet_hwol_set_outer_ipv4_csum(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IP_CKSUM;
> +}
> +
> +/* Mark packet 'b' for out udp csum offloading. */
> +static inline void
> +dp_packet_hwol_set_outer_udp_csum(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM;
> +}
> +
>  /* Returns 'true' if the IP header has good integrity and the
>   * checksum in it is complete. */
>  static inline bool
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 55700250d..fbe46fc4b 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -416,6 +416,10 @@ enum dpdk_hw_ol_features {
>      NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
>      NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
>      NETDEV_TX_TSO_OFFLOAD = 1 << 7,
> +    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8,
> +    NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9,
> +    NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10,
> +    NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11,
>  };
>
>  enum dpdk_rx_steer_flags {
> @@ -1075,6 +1079,14 @@ netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
>                                     NETDEV_TX_OFFLOAD_SCTP_CKSUM);
>      netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
>                                     NETDEV_TX_OFFLOAD_TCP_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_VXLAN_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_GENEVE_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD,
> +                                   NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD,
> +                                   NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM);
>  }
>
>  static int
> @@ -1129,6 +1141,23 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>          conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
>      }
>
> +    if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
> +    }
> +
> +
>      /* Limit configured rss hash functions to only those supported
>       * by the eth device. */
>      conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
> @@ -1346,6 +1375,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
>      }
>
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    }
> +
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    }
> +
>      dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
>      if (userspace_tso_enabled()) {
>          if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) {
> @@ -1354,6 +1395,20 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>              VLOG_WARN("%s: Tx TSO offload is not supported.",
>                        netdev_get_name(&dev->up));
>          }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
>      }
>
>      n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
> @@ -2445,29 +2500,56 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          return true;
>      }
>
> -    mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
> -    mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
> -    mbuf->l4_len = 0;
> -    mbuf->outer_l2_len = 0;
> -    mbuf->outer_l3_len = 0;
> +    /* If packet is vxlan or geneve tunnel packet, calculate outer
> +     * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated
> +     * before. */
> +    if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
> +        if (mbuf->ol_flags &
> +            (RTE_MBUF_F_TX_TUNNEL_GENEVE | RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> +                     (char *) dp_packet_eth(pkt);
> +            mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> +                     (char *) dp_packet_l3(pkt);
> +        } else {
> +            mbuf->l2_len = (char *) dp_packet_l3(pkt) -
> +                   (char *) dp_packet_eth(pkt);
> +            mbuf->l3_len = (char *) dp_packet_l4(pkt) -
> +                   (char *) dp_packet_l3(pkt);
> +            mbuf->outer_l2_len = 0;
> +            mbuf->outer_l3_len = 0;
> +        }
> +    }

This hunk above more or less reverts 5d11c47d3ebe ("userspace: Enable
IP checksum offloading by default.").
So I suspect it breaks IP checksum when OVS only requests it (and no L4).


>
>      if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
>          struct tcp_header *th = dp_packet_l4(pkt);
>
>          if (!th) {
> -            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +            VLOG_WARN_RL(&rl,
> +            "%s: TCP Segmentation without L4 header,pkt len: %"PRIu32"",
> +                dev->up.name, mbuf->pkt_len);

Whitespace change damage? if so, please drop this part.


>              return false;
>          }
>
> -        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +        if (mbuf->ol_flags & (RTE_MBUF_F_TX_TUNNEL_GENEVE |
> +            RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->tso_segsz  = dev->mtu - mbuf->l2_len - mbuf->l3_len -
> +            mbuf->l4_len - mbuf->outer_l3_len;
> +        } else {
> +            mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +            mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +        }
> +
>          mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
> -        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
>
>          if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
>              mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
>          }
>      }
> +
> +    if ((mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK)
> +        && (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4))
> +        mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;

This looks really strange, please explain why L4 offloads require ip
checksumming.


> +
>      return true;
>  }
>

I did not look at the rest of the implementation for now.
Dexia Li Sept. 25, 2023, 12:08 p.m. UTC | #2
-----邮件原件-----
发件人: David Marchand <david.marchand@redhat.com> 
发送时间: 2023年9月25日 18:16
收件人: Dexia Li <dexia.li@jaguarmicro.com>
抄送: ovs-dev@openvswitch.org; i.maximets@ovn.org; Qingmin Liu <qingmin.liu@jaguarmicro.com>; Joey Xing <joey.xing@jaguarmicro.com>; Mike Pattrick <mkp@redhat.com>
主题: Re: [PATCH v3] netdev-dpdk: Add vxlan and geneve tunnel tso.

Hello,

Quick (unfinished) comments, I will try to have a better look in the next weeks (but I am busy with 23.11).


On Mon, Sep 25, 2023 at 3:38 AM Dexia Li <dexia.li@jaguarmicro.com> wrote:
>
> For dpdk datapath, this patch provides vxlan and geneve tunnel tso.

I suppose you are talking about the userspace datapath, using DPDK support.
There is no dpdk datapath.

Ok, I will modify it.

> Only support userspace vxlan or geneve tunnel, meanwhile support 
> tunnel outter and inner csum offload. if netdev do not support offload 
> features, there is a software fallback.

Can you provide performance numbers of the scenario you tested?
About 50% increase in bandwidth. Pps also has a obvious increase. 

How does this patch impact performance with simple traffic?
It has no obvious impact on performance with simple traffic. 

Did you check L3 checksum, L4 checksum still work without tunnels?
Yes, it still works without tunnels. we also need non-tunnel scenarios.


Best regards
Dexia

>
> Signed-off-by: Dexia Li <dexia.li@jaguarmicro.com>
> ---
>  lib/dp-packet.h         | 165 ++++++++++++++++++++++++++++++++++++----
>  lib/netdev-dpdk.c       | 100 +++++++++++++++++++++---
>  lib/netdev-native-tnl.c | 106 +++++++++++++++++++++++---
>  lib/netdev-native-tnl.h |   5 +-
>  lib/netdev-provider.h   |   4 +
>  lib/netdev.c            |  34 ++++++---
>  tests/dpif-netdev.at    |   4 +-
>  7 files changed, 369 insertions(+), 49 deletions(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 
> 70ddf8aa4..4b5883dba 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -86,22 +86,43 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800),
>      /* Offload IP checksum. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 
> 0x1000),
> +    /* Offload packet is tunnel GENEVE. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_GENEVE,
> +                RTE_MBUF_F_TX_TUNNEL_GENEVE, 0x2000),
> +    /* Offload packet is tunnel VXLAN. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN,
> +                RTE_MBUF_F_TX_TUNNEL_VXLAN, 0x4000),
> +    /* Offload tunnel packet, out is ipv4 */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4,
> +                RTE_MBUF_F_TX_OUTER_IPV4, 0x8000),
> +    /* Offload TUNNEL out ipv4 checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_IP_CKSUM, 0x10000),
> +    /* Offload TUNNEL out udp checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_UDP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_UDP_CKSUM, 0x20000),
> +
>      /* Adding new field requires adding to 
> DP_PACKET_OL_SUPPORTED_MASK. */  };
>
> -#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH         | \
> -                                     DP_PACKET_OL_FLOW_MARK        | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_TX_TCP_SEG       | \
> -                                     DP_PACKET_OL_TX_IPV4          | \
> -                                     DP_PACKET_OL_TX_IPV6          | \
> -                                     DP_PACKET_OL_TX_TCP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> -                                     DP_PACKET_OL_TX_IP_CKSUM)
> +#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH           | \
> +                                     DP_PACKET_OL_FLOW_MARK          | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_TX_TCP_SEG         | \
> +                                     DP_PACKET_OL_TX_IPV4            | \
> +                                     DP_PACKET_OL_TX_IPV6            | \
> +                                     DP_PACKET_OL_TX_TCP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_UDP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM      | \
> +                                     DP_PACKET_OL_TX_IP_CKSUM        | \
> +                                     DP_PACKET_OL_TX_TUNNEL_GENEVE   | \
> +                                     DP_PACKET_OL_TX_TUNNEL_VXLAN    | \
> +                                     DP_PACKET_OL_TX_OUTER_IPV4      | \
> +                                     DP_PACKET_OL_TX_OUTER_IP_CKSUM  | \
> +                                     DP_PACKET_OL_TX_OUTER_UDP_CKSUM)
>
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \ @@ 
> -535,6 +556,25 @@ dp_packet_get_nd_payload(const struct dp_packet *b)  
> }
>
>  #ifdef DPDK_NETDEV
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b, size_t l2_len) {
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b, size_t l3_len) {
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b, size_t l4_len) {
> +    b->mbuf.l4_len = l4_len;
> +}
> +
> +

Exposing DPDK specific fields looks incorrect to me.
This should be abstracted in some way so that OVS "generic" code is unaware of what runs underneath.


>  static inline uint64_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)  { @@ -554,6 
> +594,24 @@ dp_packet_flow_mark_ptr(const struct dp_packet *b)  }
>
>  #else
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b OVS_UNUSED, size_t l2_len 
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b OVS_UNUSED, size_t l3_len 
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b OVS_UNUSED, size_t l4_len 
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +

What happens if you run this code without DPDK?
Note that those empty stubs would not be needed with a proper abstraction.


>  static inline uint32_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)  { @@ -615,9 
> +673,10 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v)
>       * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
>       * loss of accuracy in assigning 'v' to 'data_len'.
>       */
> -    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
> -    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
> -                                      * this segment. */
> +    /* Current seg length. */
> +    b->mbuf.data_len += (uint16_t)(v - b->mbuf.pkt_len);

What is this change for?


> +    /* Total length of all segments linked to this segment. */
> +    b->mbuf.pkt_len = v;
>  }
>
>  static inline uint16_t
> @@ -1030,6 +1089,43 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b)
>              DP_PACKET_OL_TX_SCTP_CKSUM;  }
>
> +/* Returns 'true' if packet 'b' is marked for tunnel GENEVE
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_geneve(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_TUNNEL_GENEVE); }
> +
> +/* Returns 'true' if packet 'b' is marked for tunnel VXLAN
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_TUNNEL_VXLAN); }
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4. */ static 
> +inline bool dp_packet_hwol_is_outer_ipv4(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_IPV4); }
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4 csum offload. 
> +*/ static inline bool dp_packet_hwol_is_outer_ipv4_cksum(struct 
> +dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_IP_CKSUM); }
> +
> +/* Returns 'true' if packet 'b' is marked for out udp csum offload. 
> +*/ static inline bool dp_packet_hwol_is_outer_UDP_cksum(struct 
> +dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_UDP_CKSUM); }
> +
>  static inline void
>  dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p)  { @@ -1105,6 
> +1201,43 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;  }
>
> +/* Mark packet 'b' for tunnel geneve offloading.  It implies that
> + * the packet 'b' is marked for tunnel geneve offloading. */ static 
> +inline void dp_packet_hwol_set_tunnel_geneve(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_GENEVE; }
> +
> +/* Mark packet 'b' for tunnel vxlan offloading.  It implies that
> + * the packet 'b' is marked for tunnel vxlan offloading. */ static 
> +inline void dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN; }
> +
> +/* Mark packet 'b' for out ipv4 packet. */ static inline void 
> +dp_packet_hwol_set_outer_ipv4(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4; }
> +
> +/* Mark packet 'b' for out ipv4 csum offloading. */ static inline 
> +void dp_packet_hwol_set_outer_ipv4_csum(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IP_CKSUM; }
> +
> +/* Mark packet 'b' for out udp csum offloading. */ static inline void 
> +dp_packet_hwol_set_outer_udp_csum(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM; }
> +
>  /* Returns 'true' if the IP header has good integrity and the
>   * checksum in it is complete. */
>  static inline bool
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 
> 55700250d..fbe46fc4b 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -416,6 +416,10 @@ enum dpdk_hw_ol_features {
>      NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
>      NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
>      NETDEV_TX_TSO_OFFLOAD = 1 << 7,
> +    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8,
> +    NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9,
> +    NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10,
> +    NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11,
>  };
>
>  enum dpdk_rx_steer_flags {
> @@ -1075,6 +1079,14 @@ netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
>                                     NETDEV_TX_OFFLOAD_SCTP_CKSUM);
>      netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
>                                     NETDEV_TX_OFFLOAD_TCP_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_VXLAN_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_GENEVE_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD,
> +                                   NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD,
> +                                   
> + NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM);
>  }
>
>  static int
> @@ -1129,6 +1141,23 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>          conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
>      }
>
> +    if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
> +    }
> +
> +
>      /* Limit configured rss hash functions to only those supported
>       * by the eth device. */
>      conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads; 
> @@ -1346,6 +1375,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
>      }
>
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    }
> +
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    }
> +
>      dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
>      if (userspace_tso_enabled()) {
>          if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) { @@ 
> -1354,6 +1395,20 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>              VLOG_WARN("%s: Tx TSO offload is not supported.",
>                        netdev_get_name(&dev->up));
>          }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
>      }
>
>      n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq); @@ -2445,29 
> +2500,56 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          return true;
>      }
>
> -    mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
> -    mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
> -    mbuf->l4_len = 0;
> -    mbuf->outer_l2_len = 0;
> -    mbuf->outer_l3_len = 0;
> +    /* If packet is vxlan or geneve tunnel packet, calculate outer
> +     * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated
> +     * before. */
> +    if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
> +        if (mbuf->ol_flags &
> +            (RTE_MBUF_F_TX_TUNNEL_GENEVE | RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> +                     (char *) dp_packet_eth(pkt);
> +            mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> +                     (char *) dp_packet_l3(pkt);
> +        } else {
> +            mbuf->l2_len = (char *) dp_packet_l3(pkt) -
> +                   (char *) dp_packet_eth(pkt);
> +            mbuf->l3_len = (char *) dp_packet_l4(pkt) -
> +                   (char *) dp_packet_l3(pkt);
> +            mbuf->outer_l2_len = 0;
> +            mbuf->outer_l3_len = 0;
> +        }
> +    }

This hunk above more or less reverts 5d11c47d3ebe ("userspace: Enable IP checksum offloading by default.").
So I suspect it breaks IP checksum when OVS only requests it (and no L4).


>
>      if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
>          struct tcp_header *th = dp_packet_l4(pkt);
>
>          if (!th) {
> -            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +            VLOG_WARN_RL(&rl,
> +            "%s: TCP Segmentation without L4 header,pkt len: %"PRIu32"",
> +                dev->up.name, mbuf->pkt_len);

Whitespace change damage? if so, please drop this part.


>              return false;
>          }
>
> -        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +        if (mbuf->ol_flags & (RTE_MBUF_F_TX_TUNNEL_GENEVE |
> +            RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->tso_segsz  = dev->mtu - mbuf->l2_len - mbuf->l3_len -
> +            mbuf->l4_len - mbuf->outer_l3_len;
> +        } else {
> +            mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +            mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +        }
> +
>          mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
> -        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
>
>          if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
>              mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
>          }
>      }
> +
> +    if ((mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK)
> +        && (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4))
> +        mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;

This looks really strange, please explain why L4 offloads require ip checksumming.


> +
>      return true;
>  }
>

I did not look at the rest of the implementation for now.



--
David Marchand
Dexia Li Sept. 27, 2023, 9:28 a.m. UTC | #3
-----邮件原件-----
发件人: Dexia Li 
发送时间: 2023年9月25日 20:08
收件人: David Marchand <david.marchand@redhat.com>
抄送: ovs-dev@openvswitch.org; i.maximets@ovn.org; Qingmin Liu <qingmin.liu@jaguarmicro.com>; Joey Xing <joey.xing@jaguarmicro.com>; Mike Pattrick <mkp@redhat.com>
主题: 答复: [PATCH v3] netdev-dpdk: Add vxlan and geneve tunnel tso.



-----邮件原件-----
发件人: David Marchand <david.marchand@redhat.com>
发送时间: 2023年9月25日 18:16
收件人: Dexia Li <dexia.li@jaguarmicro.com>
抄送: ovs-dev@openvswitch.org; i.maximets@ovn.org; Qingmin Liu <qingmin.liu@jaguarmicro.com>; Joey Xing <joey.xing@jaguarmicro.com>; Mike Pattrick <mkp@redhat.com>
主题: Re: [PATCH v3] netdev-dpdk: Add vxlan and geneve tunnel tso.

Hello,

Quick (unfinished) comments, I will try to have a better look in the next weeks (but I am busy with 23.11).

Hello, David

Sorry for missing something.

On Mon, Sep 25, 2023 at 3:38 AM Dexia Li <dexia.li@jaguarmicro.com> wrote:
>
> For dpdk datapath, this patch provides vxlan and geneve tunnel tso.

I suppose you are talking about the userspace datapath, using DPDK support.
There is no dpdk datapath.

Ok, I have corrected.

> Only support userspace vxlan or geneve tunnel, meanwhile support 
> tunnel outter and inner csum offload. if netdev do not support offload 
> features, there is a software fallback.

Can you provide performance numbers of the scenario you tested?
About 50% increase in bandwidth. Pps also has a obvious increase. 

How does this patch impact performance with simple traffic?
It has no obvious impact on performance with simple traffic. 

Did you check L3 checksum, L4 checksum still work without tunnels?
Yes, it still works without tunnels. we also need non-tunnel scenarios.



>
> Signed-off-by: Dexia Li <dexia.li@jaguarmicro.com>
> ---
>  lib/dp-packet.h         | 165 ++++++++++++++++++++++++++++++++++++----
>  lib/netdev-dpdk.c       | 100 +++++++++++++++++++++---
>  lib/netdev-native-tnl.c | 106 +++++++++++++++++++++++---
>  lib/netdev-native-tnl.h |   5 +-
>  lib/netdev-provider.h   |   4 +
>  lib/netdev.c            |  34 ++++++---
>  tests/dpif-netdev.at    |   4 +-
>  7 files changed, 369 insertions(+), 49 deletions(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 
> 70ddf8aa4..4b5883dba 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -86,22 +86,43 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800),
>      /* Offload IP checksum. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 
> 0x1000),
> +    /* Offload packet is tunnel GENEVE. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_GENEVE,
> +                RTE_MBUF_F_TX_TUNNEL_GENEVE, 0x2000),
> +    /* Offload packet is tunnel VXLAN. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN,
> +                RTE_MBUF_F_TX_TUNNEL_VXLAN, 0x4000),
> +    /* Offload tunnel packet, out is ipv4 */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4,
> +                RTE_MBUF_F_TX_OUTER_IPV4, 0x8000),
> +    /* Offload TUNNEL out ipv4 checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_IP_CKSUM, 0x10000),
> +    /* Offload TUNNEL out udp checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_UDP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_UDP_CKSUM, 0x20000),
> +
>      /* Adding new field requires adding to 
> DP_PACKET_OL_SUPPORTED_MASK. */  };
>
> -#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH         | \
> -                                     DP_PACKET_OL_FLOW_MARK        | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_TX_TCP_SEG       | \
> -                                     DP_PACKET_OL_TX_IPV4          | \
> -                                     DP_PACKET_OL_TX_IPV6          | \
> -                                     DP_PACKET_OL_TX_TCP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> -                                     DP_PACKET_OL_TX_IP_CKSUM)
> +#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH           | \
> +                                     DP_PACKET_OL_FLOW_MARK          | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_TX_TCP_SEG         | \
> +                                     DP_PACKET_OL_TX_IPV4            | \
> +                                     DP_PACKET_OL_TX_IPV6            | \
> +                                     DP_PACKET_OL_TX_TCP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_UDP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM      | \
> +                                     DP_PACKET_OL_TX_IP_CKSUM        | \
> +                                     DP_PACKET_OL_TX_TUNNEL_GENEVE   | \
> +                                     DP_PACKET_OL_TX_TUNNEL_VXLAN    | \
> +                                     DP_PACKET_OL_TX_OUTER_IPV4      | \
> +                                     DP_PACKET_OL_TX_OUTER_IP_CKSUM  | \
> +                                     DP_PACKET_OL_TX_OUTER_UDP_CKSUM)
>
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \ @@
> -535,6 +556,25 @@ dp_packet_get_nd_payload(const struct dp_packet *b) 
> }
>
>  #ifdef DPDK_NETDEV
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b, size_t l2_len) {
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b, size_t l3_len) {
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b, size_t l4_len) {
> +    b->mbuf.l4_len = l4_len;
> +}
> +
> +

Exposing DPDK specific fields looks incorrect to me.
This should be abstracted in some way so that OVS "generic" code is unaware of what runs underneath.

Here is a compile macro used to distinguish between dpdk netdev and linux kernel.
I follow the example func dp_packet_base. 


>  static inline uint64_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)  { @@ -554,6
> +594,24 @@ dp_packet_flow_mark_ptr(const struct dp_packet *b)  }
>
>  #else
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b OVS_UNUSED, size_t l2_len
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b OVS_UNUSED, size_t l3_len
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b OVS_UNUSED, size_t l4_len
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +

What happens if you run this code without DPDK?
Note that those empty stubs would not be needed with a proper abstraction.


Currently, I think only userspace vport class use this func in tunnel encap. Kernel linux tunnel 
Encap does not use it. Those empty stubs exist here for avoiding compiling errors.
I will check it without DPDK.

>  static inline uint32_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)  { @@ -615,9 
> +673,10 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v)
>       * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
>       * loss of accuracy in assigning 'v' to 'data_len'.
>       */
> -    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
> -    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
> -                                      * this segment. */
> +    /* Current seg length. */
> +    b->mbuf.data_len += (uint16_t)(v - b->mbuf.pkt_len);

What is this change for?

For multi-seg packets, pkt_len and data_len of the first seg are resized because tunnel encap.
The 128 Bytes headroom are reserved between buff_addr and data_off of mbuf. Then data_off move forward
for the len of tunnel encap. So the first seg of the multi-seg packet should resize for data_len. 
At this time, the data_len of the first seg is 2048(RTE_MBUF_DEFAULT_DATAROOM) plus tunnel encap len. 
Before the resize, the data_len of the first seg is 2048. The dataroom is full of the first seg.
 
It will result in tunnel packets tso error if the data_len of the first seg equals pkt_len. In my test,
This will result in the last packet of seg packets can not appear. 



> +    /* Total length of all segments linked to this segment. */
> +    b->mbuf.pkt_len = v;
>  }
>
>  static inline uint16_t
> @@ -1030,6 +1089,43 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b)
>              DP_PACKET_OL_TX_SCTP_CKSUM;  }
>
> +/* Returns 'true' if packet 'b' is marked for tunnel GENEVE
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_geneve(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_TUNNEL_GENEVE); }
> +
> +/* Returns 'true' if packet 'b' is marked for tunnel VXLAN
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_TUNNEL_VXLAN); }
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4. */ static 
> +inline bool dp_packet_hwol_is_outer_ipv4(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_IPV4); }
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4 csum offload. 
> +*/ static inline bool dp_packet_hwol_is_outer_ipv4_cksum(struct 
> +dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_IP_CKSUM); }
> +
> +/* Returns 'true' if packet 'b' is marked for out udp csum offload. 
> +*/ static inline bool dp_packet_hwol_is_outer_UDP_cksum(struct 
> +dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_UDP_CKSUM); }
> +
>  static inline void
>  dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p)  { @@ -1105,6 
> +1201,43 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;  }
>
> +/* Mark packet 'b' for tunnel geneve offloading.  It implies that
> + * the packet 'b' is marked for tunnel geneve offloading. */ static 
> +inline void dp_packet_hwol_set_tunnel_geneve(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_GENEVE; }
> +
> +/* Mark packet 'b' for tunnel vxlan offloading.  It implies that
> + * the packet 'b' is marked for tunnel vxlan offloading. */ static 
> +inline void dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN; }
> +
> +/* Mark packet 'b' for out ipv4 packet. */ static inline void 
> +dp_packet_hwol_set_outer_ipv4(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4; }
> +
> +/* Mark packet 'b' for out ipv4 csum offloading. */ static inline 
> +void dp_packet_hwol_set_outer_ipv4_csum(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IP_CKSUM; }
> +
> +/* Mark packet 'b' for out udp csum offloading. */ static inline void 
> +dp_packet_hwol_set_outer_udp_csum(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM; }
> +
>  /* Returns 'true' if the IP header has good integrity and the
>   * checksum in it is complete. */
>  static inline bool
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 
> 55700250d..fbe46fc4b 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -416,6 +416,10 @@ enum dpdk_hw_ol_features {
>      NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
>      NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
>      NETDEV_TX_TSO_OFFLOAD = 1 << 7,
> +    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8,
> +    NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9,
> +    NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10,
> +    NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11,
>  };
>
>  enum dpdk_rx_steer_flags {
> @@ -1075,6 +1079,14 @@ netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
>                                     NETDEV_TX_OFFLOAD_SCTP_CKSUM);
>      netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
>                                     NETDEV_TX_OFFLOAD_TCP_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_VXLAN_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_GENEVE_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD,
> +                                   NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD,
> +                                   
> + NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM);
>  }
>
>  static int
> @@ -1129,6 +1141,23 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>          conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
>      }
>
> +    if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
> +    }
> +
> +
>      /* Limit configured rss hash functions to only those supported
>       * by the eth device. */
>      conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads; 
> @@ -1346,6 +1375,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
>      }
>
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    }
> +
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    }
> +
>      dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
>      if (userspace_tso_enabled()) {
>          if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) { @@ 
> -1354,6 +1395,20 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>              VLOG_WARN("%s: Tx TSO offload is not supported.",
>                        netdev_get_name(&dev->up));
>          }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
>      }
>
>      n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq); @@ -2445,29 
> +2500,56 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          return true;
>      }
>
> -    mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
> -    mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
> -    mbuf->l4_len = 0;
> -    mbuf->outer_l2_len = 0;
> -    mbuf->outer_l3_len = 0;
> +    /* If packet is vxlan or geneve tunnel packet, calculate outer
> +     * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated
> +     * before. */
> +    if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
> +        if (mbuf->ol_flags &
> +            (RTE_MBUF_F_TX_TUNNEL_GENEVE | RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> +                     (char *) dp_packet_eth(pkt);
> +            mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> +                     (char *) dp_packet_l3(pkt);
> +        } else {
> +            mbuf->l2_len = (char *) dp_packet_l3(pkt) -
> +                   (char *) dp_packet_eth(pkt);
> +            mbuf->l3_len = (char *) dp_packet_l4(pkt) -
> +                   (char *) dp_packet_l3(pkt);
> +            mbuf->outer_l2_len = 0;
> +            mbuf->outer_l3_len = 0;
> +        }
> +    }

This hunk above more or less reverts 5d11c47d3ebe ("userspace: Enable IP checksum offloading by default.").
So I suspect it breaks IP checksum when OVS only requests it (and no L4).

I get your meaning. I will modify the condition statement.

>
>      if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
>          struct tcp_header *th = dp_packet_l4(pkt);
>
>          if (!th) {
> -            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +            VLOG_WARN_RL(&rl,
> +            "%s: TCP Segmentation without L4 header,pkt len: %"PRIu32"",
> +                dev->up.name, mbuf->pkt_len);

Whitespace change damage? if so, please drop this part.

Ok.


>              return false;
>          }
>
> -        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +        if (mbuf->ol_flags & (RTE_MBUF_F_TX_TUNNEL_GENEVE |
> +            RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->tso_segsz  = dev->mtu - mbuf->l2_len - mbuf->l3_len -
> +            mbuf->l4_len - mbuf->outer_l3_len;
> +        } else {
> +            mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +            mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +        }
> +
>          mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
> -        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
>
>          if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
>              mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
>          }
>      }
> +
> +    if ((mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK)
> +        && (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4))
> +        mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;

This looks really strange, please explain why L4 offloads require ip checksumming.

This statement is to make sure not missing RTE_MBUF_F_TX_IP_CKSUM. 
Since ip checksum is enabled default, this can be removed.


> +
>      return true;
>  }
>

I did not look at the rest of the implementation for now.



--
David Marchand


Thanks a lot, David.


Best regards
Dexia
Dexia Li Sept. 27, 2023, 9:35 a.m. UTC | #4
-----邮件原件-----
发件人: Dexia Li 
发送时间: 2023年9月27日 17:29
收件人: 'David Marchand' <david.marchand@redhat.com>
抄送: 'ovs-dev@openvswitch.org' <ovs-dev@openvswitch.org>; 'i.maximets@ovn.org' <i.maximets@ovn.org>; Qingmin Liu <qingmin.liu@jaguarmicro.com>; Joey Xing <joey.xing@jaguarmicro.com>; 'Mike Pattrick' <mkp@redhat.com>
主题: 答复: [PATCH v3] netdev-dpdk: Add vxlan and geneve tunnel tso.



-----邮件原件-----
发件人: Dexia Li 
发送时间: 2023年9月25日 20:08
收件人: David Marchand <david.marchand@redhat.com>
抄送: ovs-dev@openvswitch.org; i.maximets@ovn.org; Qingmin Liu <qingmin.liu@jaguarmicro.com>; Joey Xing <joey.xing@jaguarmicro.com>; Mike Pattrick <mkp@redhat.com>
主题: 答复: [PATCH v3] netdev-dpdk: Add vxlan and geneve tunnel tso.



-----邮件原件-----
发件人: David Marchand <david.marchand@redhat.com>
发送时间: 2023年9月25日 18:16
收件人: Dexia Li <dexia.li@jaguarmicro.com>
抄送: ovs-dev@openvswitch.org; i.maximets@ovn.org; Qingmin Liu <qingmin.liu@jaguarmicro.com>; Joey Xing <joey.xing@jaguarmicro.com>; Mike Pattrick <mkp@redhat.com>
主题: Re: [PATCH v3] netdev-dpdk: Add vxlan and geneve tunnel tso.

Hello,

Quick (unfinished) comments, I will try to have a better look in the next weeks (but I am busy with 23.11).

Hello, David

Sorry for missing something.

On Mon, Sep 25, 2023 at 3:38 AM Dexia Li <dexia.li@jaguarmicro.com> wrote:
>
> For dpdk datapath, this patch provides vxlan and geneve tunnel tso.

I suppose you are talking about the userspace datapath, using DPDK support.
There is no dpdk datapath.

Ok, I have corrected.

> Only support userspace vxlan or geneve tunnel, meanwhile support 
> tunnel outter and inner csum offload. if netdev do not support offload 
> features, there is a software fallback.

Can you provide performance numbers of the scenario you tested?
About 50% increase in bandwidth. Pps also has a obvious increase. 

How does this patch impact performance with simple traffic?
It has no obvious impact on performance with simple traffic. 

Did you check L3 checksum, L4 checksum still work without tunnels?
Yes, it still works without tunnels. we also need non-tunnel scenarios.



>
> Signed-off-by: Dexia Li <dexia.li@jaguarmicro.com>
> ---
>  lib/dp-packet.h         | 165 ++++++++++++++++++++++++++++++++++++----
>  lib/netdev-dpdk.c       | 100 +++++++++++++++++++++---
>  lib/netdev-native-tnl.c | 106 +++++++++++++++++++++++---
>  lib/netdev-native-tnl.h |   5 +-
>  lib/netdev-provider.h   |   4 +
>  lib/netdev.c            |  34 ++++++---
>  tests/dpif-netdev.at    |   4 +-
>  7 files changed, 369 insertions(+), 49 deletions(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h index 
> 70ddf8aa4..4b5883dba 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -86,22 +86,43 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800),
>      /* Offload IP checksum. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 
> 0x1000),
> +    /* Offload packet is tunnel GENEVE. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_GENEVE,
> +                RTE_MBUF_F_TX_TUNNEL_GENEVE, 0x2000),
> +    /* Offload packet is tunnel VXLAN. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN,
> +                RTE_MBUF_F_TX_TUNNEL_VXLAN, 0x4000),
> +    /* Offload tunnel packet, out is ipv4 */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4,
> +                RTE_MBUF_F_TX_OUTER_IPV4, 0x8000),
> +    /* Offload TUNNEL out ipv4 checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_IP_CKSUM, 0x10000),
> +    /* Offload TUNNEL out udp checksum */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_UDP_CKSUM,
> +                RTE_MBUF_F_TX_OUTER_UDP_CKSUM, 0x20000),
> +
>      /* Adding new field requires adding to 
> DP_PACKET_OL_SUPPORTED_MASK. */  };
>
> -#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH         | \
> -                                     DP_PACKET_OL_FLOW_MARK        | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_BAD  | \
> -                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD | \
> -                                     DP_PACKET_OL_TX_TCP_SEG       | \
> -                                     DP_PACKET_OL_TX_IPV4          | \
> -                                     DP_PACKET_OL_TX_IPV6          | \
> -                                     DP_PACKET_OL_TX_TCP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> -                                     DP_PACKET_OL_TX_IP_CKSUM)
> +#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH           | \
> +                                     DP_PACKET_OL_FLOW_MARK          | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_BAD    | \
> +                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD   | \
> +                                     DP_PACKET_OL_TX_TCP_SEG         | \
> +                                     DP_PACKET_OL_TX_IPV4            | \
> +                                     DP_PACKET_OL_TX_IPV6            | \
> +                                     DP_PACKET_OL_TX_TCP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_UDP_CKSUM       | \
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM      | \
> +                                     DP_PACKET_OL_TX_IP_CKSUM        | \
> +                                     DP_PACKET_OL_TX_TUNNEL_GENEVE   | \
> +                                     DP_PACKET_OL_TX_TUNNEL_VXLAN    | \
> +                                     DP_PACKET_OL_TX_OUTER_IPV4      | \
> +                                     DP_PACKET_OL_TX_OUTER_IP_CKSUM  | \
> +                                     DP_PACKET_OL_TX_OUTER_UDP_CKSUM)
>
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \ @@
> -535,6 +556,25 @@ dp_packet_get_nd_payload(const struct dp_packet *b) 
> }
>
>  #ifdef DPDK_NETDEV
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b, size_t l2_len) {
> +    b->mbuf.l2_len = l2_len;
> +}
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b, size_t l3_len) {
> +    b->mbuf.l3_len = l3_len;
> +}
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b, size_t l4_len) {
> +    b->mbuf.l4_len = l4_len;
> +}
> +
> +

Exposing DPDK specific fields looks incorrect to me.
This should be abstracted in some way so that OVS "generic" code is unaware of what runs underneath.

Here is a compile macro used to distinguish between dpdk netdev and linux kernel.
I follow the example func dp_packet_base. 


>  static inline uint64_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)  { @@ -554,6
> +594,24 @@ dp_packet_flow_mark_ptr(const struct dp_packet *b)  }
>
>  #else
> +static inline void
> +dp_packet_set_l2_len(struct dp_packet *b OVS_UNUSED, size_t l2_len
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +
> +static inline void
> +dp_packet_set_l3_len(struct dp_packet *b OVS_UNUSED, size_t l3_len
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +
> +static inline void
> +dp_packet_set_l4_len(struct dp_packet *b OVS_UNUSED, size_t l4_len
> +OVS_UNUSED) {
> +    /* There are no implementation */ }
> +

What happens if you run this code without DPDK?
Note that those empty stubs would not be needed with a proper abstraction.


Currently, I think only userspace vport class use this func in tunnel encap. Kernel linux tunnel 
Encap does not use it. Those empty stubs exist here for avoiding compiling errors.
I will check it without DPDK.

>  static inline uint32_t *
>  dp_packet_ol_flags_ptr(const struct dp_packet *b)  { @@ -615,9 
> +673,10 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v)
>       * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
>       * loss of accuracy in assigning 'v' to 'data_len'.
>       */
> -    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
> -    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
> -                                      * this segment. */
> +    /* Current seg length. */
> +    b->mbuf.data_len += (uint16_t)(v - b->mbuf.pkt_len);

What is this change for?

For multi-seg packets, pkt_len and data_len of the first seg are resized because tunnel encap.
The 128 Bytes headroom are reserved between buff_addr and data_off of mbuf. Then data_off move forward
for the len of tunnel encap. So the first seg of the multi-seg packet should resize for data_len. 
At this time, the data_len of the first seg is 2048(RTE_MBUF_DEFAULT_DATAROOM) plus tunnel encap len. 
Before the resize, the data_len of the first seg is 2048. The dataroom is full of the first seg.
 
It will result in tunnel packets tso error if the data_len of the first seg equals pkt_len. In my test,
This will result in the last packet of seg packets can not appear. 



> +    /* Total length of all segments linked to this segment. */
> +    b->mbuf.pkt_len = v;
>  }
>
>  static inline uint16_t
> @@ -1030,6 +1089,43 @@ dp_packet_hwol_l4_is_sctp(struct dp_packet *b)
>              DP_PACKET_OL_TX_SCTP_CKSUM;  }
>
> +/* Returns 'true' if packet 'b' is marked for tunnel GENEVE
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_geneve(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_TUNNEL_GENEVE); }
> +
> +/* Returns 'true' if packet 'b' is marked for tunnel VXLAN
> + * checksum offloading. */
> +static inline bool
> +dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_TUNNEL_VXLAN); }
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4. */ static 
> +inline bool dp_packet_hwol_is_outer_ipv4(struct dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_IPV4); }
> +
> +/* Returns 'true' if packet 'b' is marked for out ipv4 csum offload. 
> +*/ static inline bool dp_packet_hwol_is_outer_ipv4_cksum(struct 
> +dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_IP_CKSUM); }
> +
> +/* Returns 'true' if packet 'b' is marked for out udp csum offload. 
> +*/ static inline bool dp_packet_hwol_is_outer_UDP_cksum(struct 
> +dp_packet *b) {
> +    return !!(*dp_packet_ol_flags_ptr(b) & 
> +DP_PACKET_OL_TX_OUTER_UDP_CKSUM); }
> +
>  static inline void
>  dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p)  { @@ -1105,6 
> +1201,43 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;  }
>
> +/* Mark packet 'b' for tunnel geneve offloading.  It implies that
> + * the packet 'b' is marked for tunnel geneve offloading. */ static 
> +inline void dp_packet_hwol_set_tunnel_geneve(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_GENEVE; }
> +
> +/* Mark packet 'b' for tunnel vxlan offloading.  It implies that
> + * the packet 'b' is marked for tunnel vxlan offloading. */ static 
> +inline void dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN; }
> +
> +/* Mark packet 'b' for out ipv4 packet. */ static inline void 
> +dp_packet_hwol_set_outer_ipv4(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4; }
> +
> +/* Mark packet 'b' for out ipv4 csum offloading. */ static inline 
> +void dp_packet_hwol_set_outer_ipv4_csum(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IP_CKSUM; }
> +
> +/* Mark packet 'b' for out udp csum offloading. */ static inline void 
> +dp_packet_hwol_set_outer_udp_csum(struct dp_packet *b) {
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM; }
> +
>  /* Returns 'true' if the IP header has good integrity and the
>   * checksum in it is complete. */
>  static inline bool
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 
> 55700250d..fbe46fc4b 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -416,6 +416,10 @@ enum dpdk_hw_ol_features {
>      NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
>      NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
>      NETDEV_TX_TSO_OFFLOAD = 1 << 7,
> +    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8,
> +    NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9,
> +    NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10,
> +    NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11,
>  };
>
>  enum dpdk_rx_steer_flags {
> @@ -1075,6 +1079,14 @@ netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
>                                     NETDEV_TX_OFFLOAD_SCTP_CKSUM);
>      netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
>                                     NETDEV_TX_OFFLOAD_TCP_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_VXLAN_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD,
> +                                   NETDEV_TX_GENEVE_TNL_TSO);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD,
> +                                   NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM);
> +    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD,
> +                                   
> + NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM);
>  }
>
>  static int
> @@ -1129,6 +1141,23 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
>          conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
>      }
>
> +    if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +    }
> +
> +    if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
> +    }
> +
> +
>      /* Limit configured rss hash functions to only those supported
>       * by the eth device. */
>      conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads; 
> @@ -1346,6 +1375,18 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
>      }
>
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
> +    }
> +
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
> +    }
> +
>      dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
>      if (userspace_tso_enabled()) {
>          if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) { @@ 
> -1354,6 +1395,20 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>              VLOG_WARN("%s: Tx TSO offload is not supported.",
>                        netdev_get_name(&dev->up));
>          }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
> +
> +        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
> +            dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
> +        } else {
> +            VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
> +                      netdev_get_name(&dev->up));
> +        }
>      }
>
>      n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq); @@ -2445,29 
> +2500,56 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          return true;
>      }
>
> -    mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
> -    mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
> -    mbuf->l4_len = 0;
> -    mbuf->outer_l2_len = 0;
> -    mbuf->outer_l3_len = 0;
> +    /* If packet is vxlan or geneve tunnel packet, calculate outer
> +     * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated
> +     * before. */
> +    if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
> +        if (mbuf->ol_flags &
> +            (RTE_MBUF_F_TX_TUNNEL_GENEVE | RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
> +                     (char *) dp_packet_eth(pkt);
> +            mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
> +                     (char *) dp_packet_l3(pkt);
> +        } else {
> +            mbuf->l2_len = (char *) dp_packet_l3(pkt) -
> +                   (char *) dp_packet_eth(pkt);
> +            mbuf->l3_len = (char *) dp_packet_l4(pkt) -
> +                   (char *) dp_packet_l3(pkt);
> +            mbuf->outer_l2_len = 0;
> +            mbuf->outer_l3_len = 0;
> +        }
> +    }

This hunk above more or less reverts 5d11c47d3ebe ("userspace: Enable IP checksum offloading by default.").
So I suspect it breaks IP checksum when OVS only requests it (and no L4).

I get your meaning. I will modify the condition statement.

>
>      if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
>          struct tcp_header *th = dp_packet_l4(pkt);
>
>          if (!th) {
> -            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +            VLOG_WARN_RL(&rl,
> +            "%s: TCP Segmentation without L4 header,pkt len: %"PRIu32"",
> +                dev->up.name, mbuf->pkt_len);

Whitespace change damage? if so, please drop this part.

Ok.


>              return false;
>          }
>
> -        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +        if (mbuf->ol_flags & (RTE_MBUF_F_TX_TUNNEL_GENEVE |
> +            RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
> +            mbuf->tso_segsz  = dev->mtu - mbuf->l2_len - mbuf->l3_len -
> +            mbuf->l4_len - mbuf->outer_l3_len;
> +        } else {
> +            mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
> +            mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
> +        }
> +
>          mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
> -        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
>
>          if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
>              mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
>          }
>      }
> +
> +    if ((mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK)
> +        && (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4))
> +        mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;

This looks really strange, please explain why L4 offloads require ip checksumming.

This statement is to make sure not missing RTE_MBUF_F_TX_IP_CKSUM. 
Since ip checksum is enabled default, this can be removed.


> +
>      return true;
>  }
>

I did not look at the rest of the implementation for now.



--
David Marchand


Thanks a lot, David.


Best regards
Dexia
David Marchand Sept. 27, 2023, 3:43 p.m. UTC | #5
On Wed, Sep 27, 2023 at 11:29 AM Dexia Li <dexia.li@jaguarmicro.com> wrote:
> > >  static inline uint32_t *
> > >  dp_packet_ol_flags_ptr(const struct dp_packet *b)  { @@ -615,9
> > > +673,10 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v)
> > >       * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
> > >       * loss of accuracy in assigning 'v' to 'data_len'.
> > >       */
> > > -    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
> > > -    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
> > > -                                      * this segment. */
> > > +    /* Current seg length. */
> > > +    b->mbuf.data_len += (uint16_t)(v - b->mbuf.pkt_len);
> >
> > What is this change for?
>
> For multi-seg packets, pkt_len and data_len of the first seg are resized because tunnel encap.

OVS userspace (datapath) code does not support multi segments packets.
Besides, the default headroom of a dp-packet is enough for most tunnel headers.
"Non DPDK" dp-packet can be adjusted dynamically with
dp_packet_resize(), only DPDK dp-packet has some limitation (for which
I have a pending patch).


> The 128 Bytes headroom are reserved between buff_addr and data_off of mbuf. Then data_off move forward
> for the len of tunnel encap. So the first seg of the multi-seg packet should resize for data_len.
> At this time, the data_len of the first seg is 2048(RTE_MBUF_DEFAULT_DATAROOM) plus tunnel encap len.
> Before the resize, the data_len of the first seg is 2048. The dataroom is full of the first seg.
>
> It will result in tunnel packets tso error if the data_len of the first seg equals pkt_len. In my test,
> This will result in the last packet of seg packets can not appear.

I am not following.
There is no first segment to consider because there is only one
segment/buffer in OVS dp-packets.

When encapsulating, OVS checks if current headroom is enough. If not,
it allocates a bigger buffer and copies data into it.

In vxlan tunnel case,
netdev_push_header
 - netdev_tnl_push_udp_header
  - netdev_tnl_push_ip_header
   - dp_packet_push_uninit
    - dp_packet_prealloc_headroom(b, size);
     - dp_packet_resize(b, MAX(size, 64), dp_packet_tailroom(b));
      - https://github.com/openvswitch/ovs/blob/master/lib/dp-packet.c#L253
    - dp_packet_set_data(b, (char*)dp_packet_data(b) - size);
    - dp_packet_set_size(b, dp_packet_size(b) + size);

For DPDK-source dp-packets, as long as the headroom is big enough, the
current code should handle this situation fine.

If the headroom is not big enough, well, you may have a look at below
link, but that required at least 3 levels of encapsulations iirc.
https://patchwork.ozlabs.org/project/openvswitch/patch/20220318153339.31083-1-david.marchand@redhat.com/


Did I miss something?
Dexia Li Sept. 28, 2023, 12:03 a.m. UTC | #6
-----邮件原件-----
发件人: David Marchand <david.marchand@redhat.com> 
发送时间: 2023年9月27日 23:43
收件人: Dexia Li <dexia.li@jaguarmicro.com>
抄送: ovs-dev@openvswitch.org; i.maximets@ovn.org; Qingmin Liu <qingmin.liu@jaguarmicro.com>; Joey Xing <joey.xing@jaguarmicro.com>; Mike Pattrick <mkp@redhat.com>
主题: Re: [PATCH v3] netdev-dpdk: Add vxlan and geneve tunnel tso.

On Wed, Sep 27, 2023 at 11:29 AM Dexia Li <dexia.li@jaguarmicro.com> wrote:
> > >  static inline uint32_t *
> > >  dp_packet_ol_flags_ptr(const struct dp_packet *b)  { @@ -615,9
> > > +673,10 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v)
> > >       * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
> > >       * loss of accuracy in assigning 'v' to 'data_len'.
> > >       */
> > > -    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
> > > -    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
> > > -                                      * this segment. */
> > > +    /* Current seg length. */
> > > +    b->mbuf.data_len += (uint16_t)(v - b->mbuf.pkt_len);
> >
> > What is this change for?
>
> For multi-seg packets, pkt_len and data_len of the first seg are resized because tunnel encap.

OVS userspace (datapath) code does not support multi segments packets.
Besides, the default headroom of a dp-packet is enough for most tunnel headers.
"Non DPDK" dp-packet can be adjusted dynamically with dp_packet_resize(), only DPDK dp-packet has some limitation (for which I have a pending patch).


> The 128 Bytes headroom are reserved between buff_addr and data_off of 
> mbuf. Then data_off move forward for the len of tunnel encap. So the first seg of the multi-seg packet should resize for data_len.
> At this time, the data_len of the first seg is 2048(RTE_MBUF_DEFAULT_DATAROOM) plus tunnel encap len.
> Before the resize, the data_len of the first seg is 2048. The dataroom is full of the first seg.
>
> It will result in tunnel packets tso error if the data_len of the 
> first seg equals pkt_len. In my test, This will result in the last packet of seg packets can not appear.

I am not following.
There is no first segment to consider because there is only one segment/buffer in OVS dp-packets.

When encapsulating, OVS checks if current headroom is enough. If not, it allocates a bigger buffer and copies data into it.

In vxlan tunnel case,
netdev_push_header
 - netdev_tnl_push_udp_header
  - netdev_tnl_push_ip_header
   - dp_packet_push_uninit
    - dp_packet_prealloc_headroom(b, size);
     - dp_packet_resize(b, MAX(size, 64), dp_packet_tailroom(b));
      - https://github.com/openvswitch/ovs/blob/master/lib/dp-packet.c#L253
    - dp_packet_set_data(b, (char*)dp_packet_data(b) - size);
    - dp_packet_set_size(b, dp_packet_size(b) + size);

For DPDK-source dp-packets, as long as the headroom is big enough, the current code should handle this situation fine.

If the headroom is not big enough, well, you may have a look at below link, but that required at least 3 levels of encapsulations iirc.
https://patchwork.ozlabs.org/project/openvswitch/patch/20220318153339.31083-1-david.marchand@redhat.com/


Did I miss something?

No, I did not make it clear. we can look at the graph in the attachment
When using dpdk dp packet, dp_packet_set_size will set wrong data_len of first seg of rte mbuf.
It will result in tso error.


--
David Marchand
diff mbox series

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 70ddf8aa4..4b5883dba 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -86,22 +86,43 @@  enum dp_packet_offload_mask {
     DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800),
     /* Offload IP checksum. */
     DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000),
+    /* Offload packet is tunnel GENEVE. */
+    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_GENEVE,
+                RTE_MBUF_F_TX_TUNNEL_GENEVE, 0x2000),
+    /* Offload packet is tunnel VXLAN. */
+    DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN,
+                RTE_MBUF_F_TX_TUNNEL_VXLAN, 0x4000),
+    /* Offload tunnel packet, out is ipv4 */
+    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IPV4,
+                RTE_MBUF_F_TX_OUTER_IPV4, 0x8000),
+    /* Offload TUNNEL out ipv4 checksum */
+    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_IP_CKSUM,
+                RTE_MBUF_F_TX_OUTER_IP_CKSUM, 0x10000),
+    /* Offload TUNNEL out udp checksum */
+    DEF_OL_FLAG(DP_PACKET_OL_TX_OUTER_UDP_CKSUM,
+                RTE_MBUF_F_TX_OUTER_UDP_CKSUM, 0x20000),
+
     /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
 };
 
-#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH         | \
-                                     DP_PACKET_OL_FLOW_MARK        | \
-                                     DP_PACKET_OL_RX_L4_CKSUM_BAD  | \
-                                     DP_PACKET_OL_RX_IP_CKSUM_BAD  | \
-                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD | \
-                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD | \
-                                     DP_PACKET_OL_TX_TCP_SEG       | \
-                                     DP_PACKET_OL_TX_IPV4          | \
-                                     DP_PACKET_OL_TX_IPV6          | \
-                                     DP_PACKET_OL_TX_TCP_CKSUM     | \
-                                     DP_PACKET_OL_TX_UDP_CKSUM     | \
-                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
-                                     DP_PACKET_OL_TX_IP_CKSUM)
+#define DP_PACKET_OL_SUPPORTED_MASK (DP_PACKET_OL_RSS_HASH           | \
+                                     DP_PACKET_OL_FLOW_MARK          | \
+                                     DP_PACKET_OL_RX_L4_CKSUM_BAD    | \
+                                     DP_PACKET_OL_RX_IP_CKSUM_BAD    | \
+                                     DP_PACKET_OL_RX_L4_CKSUM_GOOD   | \
+                                     DP_PACKET_OL_RX_IP_CKSUM_GOOD   | \
+                                     DP_PACKET_OL_TX_TCP_SEG         | \
+                                     DP_PACKET_OL_TX_IPV4            | \
+                                     DP_PACKET_OL_TX_IPV6            | \
+                                     DP_PACKET_OL_TX_TCP_CKSUM       | \
+                                     DP_PACKET_OL_TX_UDP_CKSUM       | \
+                                     DP_PACKET_OL_TX_SCTP_CKSUM      | \
+                                     DP_PACKET_OL_TX_IP_CKSUM        | \
+                                     DP_PACKET_OL_TX_TUNNEL_GENEVE   | \
+                                     DP_PACKET_OL_TX_TUNNEL_VXLAN    | \
+                                     DP_PACKET_OL_TX_OUTER_IPV4      | \
+                                     DP_PACKET_OL_TX_OUTER_IP_CKSUM  | \
+                                     DP_PACKET_OL_TX_OUTER_UDP_CKSUM)
 
 #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
                                  DP_PACKET_OL_TX_UDP_CKSUM | \
@@ -535,6 +556,25 @@  dp_packet_get_nd_payload(const struct dp_packet *b)
 }
 
 #ifdef DPDK_NETDEV
+static inline void
+dp_packet_set_l2_len(struct dp_packet *b, size_t l2_len)
+{
+    b->mbuf.l2_len = l2_len;
+}
+
+static inline void
+dp_packet_set_l3_len(struct dp_packet *b, size_t l3_len)
+{
+    b->mbuf.l3_len = l3_len;
+}
+
+static inline void
+dp_packet_set_l4_len(struct dp_packet *b, size_t l4_len)
+{
+    b->mbuf.l4_len = l4_len;
+}
+
+
 static inline uint64_t *
 dp_packet_ol_flags_ptr(const struct dp_packet *b)
 {
@@ -554,6 +594,24 @@  dp_packet_flow_mark_ptr(const struct dp_packet *b)
 }
 
 #else
+static inline void
+dp_packet_set_l2_len(struct dp_packet *b OVS_UNUSED, size_t l2_len OVS_UNUSED)
+{
+    /* There are no implementation */
+}
+
+static inline void
+dp_packet_set_l3_len(struct dp_packet *b OVS_UNUSED, size_t l3_len OVS_UNUSED)
+{
+    /* There are no implementation */
+}
+
+static inline void
+dp_packet_set_l4_len(struct dp_packet *b OVS_UNUSED, size_t l4_len OVS_UNUSED)
+{
+    /* There are no implementation */
+}
+
 static inline uint32_t *
 dp_packet_ol_flags_ptr(const struct dp_packet *b)
 {
@@ -615,9 +673,10 @@  dp_packet_set_size(struct dp_packet *b, uint32_t v)
      * (and thus 'v') will always be <= UINT16_MAX; this means that there is no
      * loss of accuracy in assigning 'v' to 'data_len'.
      */
-    b->mbuf.data_len = (uint16_t)v;  /* Current seg length. */
-    b->mbuf.pkt_len = v;             /* Total length of all segments linked to
-                                      * this segment. */
+    /* Current seg length. */
+    b->mbuf.data_len += (uint16_t)(v - b->mbuf.pkt_len);
+    /* Total length of all segments linked to this segment. */
+    b->mbuf.pkt_len = v;
 }
 
 static inline uint16_t
@@ -1030,6 +1089,43 @@  dp_packet_hwol_l4_is_sctp(struct dp_packet *b)
             DP_PACKET_OL_TX_SCTP_CKSUM;
 }
 
+/* Returns 'true' if packet 'b' is marked for tunnel GENEVE
+ * checksum offloading. */
+static inline bool
+dp_packet_hwol_is_tunnel_geneve(struct dp_packet *b)
+{
+    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_GENEVE);
+}
+
+/* Returns 'true' if packet 'b' is marked for tunnel VXLAN
+ * checksum offloading. */
+static inline bool
+dp_packet_hwol_is_tunnel_vxlan(struct dp_packet *b)
+{
+    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TUNNEL_VXLAN);
+}
+
+/* Returns 'true' if packet 'b' is marked for out ipv4. */
+static inline bool
+dp_packet_hwol_is_outer_ipv4(struct dp_packet *b)
+{
+    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IPV4);
+}
+
+/* Returns 'true' if packet 'b' is marked for out ipv4 csum offload. */
+static inline bool
+dp_packet_hwol_is_outer_ipv4_cksum(struct dp_packet *b)
+{
+    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_IP_CKSUM);
+}
+
+/* Returns 'true' if packet 'b' is marked for out udp csum offload. */
+static inline bool
+dp_packet_hwol_is_outer_UDP_cksum(struct dp_packet *b)
+{
+    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_OUTER_UDP_CKSUM);
+}
+
 static inline void
 dp_packet_hwol_reset_tx_l4_csum(struct dp_packet *p)
 {
@@ -1105,6 +1201,43 @@  dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
     *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
 }
 
+/* Mark packet 'b' for tunnel geneve offloading.  It implies that
+ * the packet 'b' is marked for tunnel geneve offloading. */
+static inline void
+dp_packet_hwol_set_tunnel_geneve(struct dp_packet *b)
+{
+    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_GENEVE;
+}
+
+/* Mark packet 'b' for tunnel vxlan offloading.  It implies that
+ * the packet 'b' is marked for tunnel vxlan offloading. */
+static inline void
+dp_packet_hwol_set_tunnel_vxlan(struct dp_packet *b)
+{
+    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TUNNEL_VXLAN;
+}
+
+/* Mark packet 'b' for out ipv4 packet. */
+static inline void
+dp_packet_hwol_set_outer_ipv4(struct dp_packet *b)
+{
+    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IPV4;
+}
+
+/* Mark packet 'b' for out ipv4 csum offloading. */
+static inline void
+dp_packet_hwol_set_outer_ipv4_csum(struct dp_packet *b)
+{
+    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_IP_CKSUM;
+}
+
+/* Mark packet 'b' for out udp csum offloading. */
+static inline void
+dp_packet_hwol_set_outer_udp_csum(struct dp_packet *b)
+{
+    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_OUTER_UDP_CKSUM;
+}
+
 /* Returns 'true' if the IP header has good integrity and the
  * checksum in it is complete. */
 static inline bool
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 55700250d..fbe46fc4b 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -416,6 +416,10 @@  enum dpdk_hw_ol_features {
     NETDEV_TX_UDP_CKSUM_OFFLOAD = 1 << 5,
     NETDEV_TX_SCTP_CKSUM_OFFLOAD = 1 << 6,
     NETDEV_TX_TSO_OFFLOAD = 1 << 7,
+    NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD = 1 << 8,
+    NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD = 1 << 9,
+    NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD = 1 << 10,
+    NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD = 1 << 11,
 };
 
 enum dpdk_rx_steer_flags {
@@ -1075,6 +1079,14 @@  netdev_dpdk_update_netdev_flags(struct netdev_dpdk *dev)
                                    NETDEV_TX_OFFLOAD_SCTP_CKSUM);
     netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_TSO_OFFLOAD,
                                    NETDEV_TX_OFFLOAD_TCP_TSO);
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD,
+                                   NETDEV_TX_VXLAN_TNL_TSO);
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD,
+                                   NETDEV_TX_GENEVE_TNL_TSO);
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD,
+                                   NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM);
+    netdev_dpdk_update_netdev_flag(dev, NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD,
+                                   NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM);
 }
 
 static int
@@ -1129,6 +1141,23 @@  dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq)
         conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_TCP_TSO;
     }
 
+    if (dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD) {
+        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO;
+    }
+
+    if (dev->hw_ol_features & NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD) {
+        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
+    }
+
+    if (dev->hw_ol_features & NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD) {
+        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+    }
+
+    if (dev->hw_ol_features & NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD) {
+        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;
+    }
+
+
     /* Limit configured rss hash functions to only those supported
      * by the eth device. */
     conf.rx_adv_conf.rss_conf.rss_hf &= info.flow_type_rss_offloads;
@@ -1346,6 +1375,18 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev)
         dev->hw_ol_features &= ~NETDEV_TX_SCTP_CKSUM_OFFLOAD;
     }
 
+    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_IPV4_CKSUM) {
+        dev->hw_ol_features |= NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
+    } else {
+        dev->hw_ol_features &= ~NETDEV_TX_OUTER_IP_CKSUM_OFFLOAD;
+    }
+
+    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM) {
+        dev->hw_ol_features |= NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
+    } else {
+        dev->hw_ol_features &= ~NETDEV_TX_OUTER_UDP_CKSUM_OFFLOAD;
+    }
+
     dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
     if (userspace_tso_enabled()) {
         if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_TCP_TSO) {
@@ -1354,6 +1395,20 @@  dpdk_eth_dev_init(struct netdev_dpdk *dev)
             VLOG_WARN("%s: Tx TSO offload is not supported.",
                       netdev_get_name(&dev->up));
         }
+
+        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO) {
+            dev->hw_ol_features |= NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD;
+        } else {
+            VLOG_WARN("%s: Tx Vxlan tunnel TSO offload is not supported.",
+                      netdev_get_name(&dev->up));
+        }
+
+        if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO) {
+            dev->hw_ol_features |= NETDEV_TX_GENEVE_TNL_TSO_OFFLOAD;
+        } else {
+            VLOG_WARN("%s: Tx Geneve tunnel TSO offload is not supported.",
+                      netdev_get_name(&dev->up));
+        }
     }
 
     n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
@@ -2445,29 +2500,56 @@  netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         return true;
     }
 
-    mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
-    mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
-    mbuf->l4_len = 0;
-    mbuf->outer_l2_len = 0;
-    mbuf->outer_l3_len = 0;
+    /* If packet is vxlan or geneve tunnel packet, calculate outer
+     * l2 len and outer l3 len. Inner l2/l3/l4 len are calculated
+     * before. */
+    if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
+        if (mbuf->ol_flags &
+            (RTE_MBUF_F_TX_TUNNEL_GENEVE | RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
+            mbuf->outer_l2_len = (char *) dp_packet_l3(pkt) -
+                     (char *) dp_packet_eth(pkt);
+            mbuf->outer_l3_len = (char *) dp_packet_l4(pkt) -
+                     (char *) dp_packet_l3(pkt);
+        } else {
+            mbuf->l2_len = (char *) dp_packet_l3(pkt) -
+                   (char *) dp_packet_eth(pkt);
+            mbuf->l3_len = (char *) dp_packet_l4(pkt) -
+                   (char *) dp_packet_l3(pkt);
+            mbuf->outer_l2_len = 0;
+            mbuf->outer_l3_len = 0;
+        }
+    }
 
     if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
         struct tcp_header *th = dp_packet_l4(pkt);
 
         if (!th) {
-            VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
-                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
+            VLOG_WARN_RL(&rl,
+            "%s: TCP Segmentation without L4 header,pkt len: %"PRIu32"",
+                dev->up.name, mbuf->pkt_len);
             return false;
         }
 
-        mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
+        if (mbuf->ol_flags & (RTE_MBUF_F_TX_TUNNEL_GENEVE |
+            RTE_MBUF_F_TX_TUNNEL_VXLAN)) {
+            mbuf->tso_segsz  = dev->mtu - mbuf->l2_len - mbuf->l3_len -
+            mbuf->l4_len - mbuf->outer_l3_len;
+        } else {
+            mbuf->l4_len = TCP_OFFSET(th->tcp_ctl) * 4;
+            mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
+        }
+
         mbuf->ol_flags |= RTE_MBUF_F_TX_TCP_CKSUM;
-        mbuf->tso_segsz = dev->mtu - mbuf->l3_len - mbuf->l4_len;
 
         if (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4) {
             mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
         }
     }
+
+    if ((mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK)
+        && (mbuf->ol_flags & RTE_MBUF_F_TX_IPV4))
+        mbuf->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
+
     return true;
 }
 
diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
index 715bbab2b..427f5f8b9 100644
--- a/lib/netdev-native-tnl.c
+++ b/lib/netdev-native-tnl.c
@@ -60,6 +60,12 @@  static struct vlog_rate_limit err_rl = VLOG_RATE_LIMIT_INIT(60, 5);
 uint16_t tnl_udp_port_min = 32768;
 uint16_t tnl_udp_port_max = 61000;
 
+static void
+dp_packet_tnl_ol_send_prepare(const struct netdev *netdev,
+                             struct dp_packet *packet,
+                             uint64_t netdev_ol_flags,
+                             const struct ovs_action_push_tnl *data);
+
 void *
 netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
                   unsigned int *hlen)
@@ -150,8 +156,9 @@  netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
  *
  * Return pointer to the L4 header added to 'packet'. */
 void *
-netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
-                          int size, int *ip_tot_size, ovs_be32 ipv6_label)
+netdev_tnl_push_ip_header(uint64_t netdev_ol_flags, struct dp_packet *packet,
+                          const void *header, int size,
+                          int *ip_tot_size, ovs_be32 ipv6_label)
 {
     struct eth_header *eth;
     struct ip_header *ip;
@@ -178,6 +185,20 @@  netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
     } else {
         ip = netdev_tnl_ip_hdr(eth);
         ip->ip_tot_len = htons(*ip_tot_size);
+        /* if support out ip csum offload, set relative flags.
+         * otherwise postpone csum to when the packet is pushed to the port. */
+        if ((dp_packet_hwol_is_tunnel_geneve(packet) ||
+            dp_packet_hwol_is_tunnel_vxlan(packet)) &&
+            (netdev_ol_flags & NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM)) {
+            dp_packet_hwol_set_outer_ipv4(packet);
+            dp_packet_hwol_set_outer_ipv4_csum(packet);
+        } else {
+            /* offload tunnel out ipv4 checksum as normal ipv4 csum.
+             * Postpone checksum to when the packet is pushed to the port. */
+            dp_packet_hwol_set_tx_ipv4(packet);
+            dp_packet_hwol_set_tx_ip_csum(packet);
+            dp_packet_ol_reset_ip_csum_good(packet);
+        }
         /* Postpone checksum to when the packet is pushed to the port. */
         dp_packet_hwol_set_tx_ipv4(packet);
         dp_packet_hwol_set_tx_ip_csum(packet);
@@ -225,23 +246,88 @@  udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
     return udp + 1;
 }
 
+/* Calculate inner l2 l3 l4 len as tunnel outer header is not
+ * encapsulated now. */
+static void
+dp_packet_tnl_ol_send_prepare(const struct netdev *netdev,
+                             struct dp_packet *packet,
+                             uint64_t netdev_ol_flags,
+                             const struct ovs_action_push_tnl *data)
+{
+    struct udp_header *udp = NULL;
+    uint8_t opt_len = 0;
+    struct eth_header *eth = NULL;
+    struct ip_header *ip = NULL;
+    struct genevehdr *gnh = NULL;
+
+    /* l2 l3 l4 len refer to inner len, tunnel outer
+     * header is not encapsulated here. */
+    if (dp_packet_hwol_l4_mask(packet)) {
+        ip = dp_packet_l3(packet);
+
+        if (ip->ip_proto == IPPROTO_TCP) {
+            struct tcp_header *th = dp_packet_l4(packet);
+            dp_packet_set_l4_len(packet, TCP_OFFSET(th->tcp_ctl) * 4);
+        } else if (ip->ip_proto == IPPROTO_UDP) {
+            dp_packet_set_l4_len(packet, UDP_HEADER_LEN);
+        } else if (ip->ip_proto == IPPROTO_SCTP) {
+            dp_packet_set_l4_len(packet, SCTP_HEADER_LEN);
+        }
+
+        dp_packet_set_l3_len(packet, (char *) dp_packet_l4(packet) -
+                              (char *) dp_packet_l3(packet));
+
+        /* Attention please, tunnel inner l2 len is consist of udp header
+         * len and tunnel header len and inner l2 len. */
+        if (!strcmp(netdev_get_type(netdev), "geneve")) {
+            eth = (struct eth_header *)(data->header);
+            ip = (struct ip_header *)(eth + 1);
+            udp = (struct udp_header *)(ip + 1);
+            gnh = (struct genevehdr *)(udp + 1);
+            opt_len = gnh->opt_len * 4;
+            if (netdev_ol_flags & NETDEV_TX_GENEVE_TNL_TSO) {
+                dp_packet_hwol_set_tunnel_geneve(packet);
+                dp_packet_set_l2_len(packet, (char *) dp_packet_l3(packet) -
+                                  (char *) dp_packet_eth(packet) +
+                                  GENEVE_BASE_HLEN + opt_len);
+            }
+        } else if (!strcmp(netdev_get_type(netdev), "vxlan")) {
+            if (netdev_ol_flags & NETDEV_TX_VXLAN_TNL_TSO) {
+                dp_packet_hwol_set_tunnel_vxlan(packet);
+                dp_packet_set_l2_len(packet, (char *) dp_packet_l3(packet) -
+                                  (char *) dp_packet_eth(packet) +
+                                  VXLAN_HLEN);
+            }
+        }
+    }
+}
+
 void
-netdev_tnl_push_udp_header(const struct netdev *netdev OVS_UNUSED,
+netdev_tnl_push_udp_header(const struct netdev *netdev,
                            struct dp_packet *packet,
                            const struct ovs_action_push_tnl *data)
 {
     struct udp_header *udp;
     int ip_tot_size;
 
-    udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len,
+    dp_packet_tnl_ol_send_prepare(netdev, packet, netdev->ol_flags, data);
+
+    udp = netdev_tnl_push_ip_header(netdev->ol_flags, packet, data->header,
+                                    data->header_len,
                                     &ip_tot_size, 0);
 
     /* set udp src port */
     udp->udp_src = netdev_tnl_get_src_port(packet);
     udp->udp_len = htons(ip_tot_size);
 
-    /* Postpone checksum to the egress netdev. */
-    dp_packet_hwol_set_csum_udp(packet);
+    /* if netdev support outer udp csum offload, positon relative bit.
+     * otherwise postpone checksum to the egress netdev. */
+    if (netdev->ol_flags & NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM) {
+        dp_packet_hwol_set_outer_udp_csum(packet);
+    } else {
+        udp->udp_csum = 0;
+    }
+
     if (udp->udp_csum) {
         dp_packet_ol_reset_l4_csum_good(packet);
     } else {
@@ -449,7 +535,7 @@  netdev_gre_push_header(const struct netdev *netdev,
     struct gre_base_hdr *greh;
     int ip_tot_size;
 
-    greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len,
+    greh = netdev_tnl_push_ip_header(0, packet, data->header, data->header_len,
                                      &ip_tot_size, 0);
 
     if (greh->flags & htons(GRE_CSUM)) {
@@ -597,7 +683,7 @@  netdev_erspan_push_header(const struct netdev *netdev,
     struct erspan_md2 *md2;
     int ip_tot_size;
 
-    greh = netdev_tnl_push_ip_header(packet, data->header, data->header_len,
+    greh = netdev_tnl_push_ip_header(0, packet, data->header, data->header_len,
                                      &ip_tot_size, 0);
 
     /* update GRE seqno */
@@ -771,7 +857,7 @@  netdev_gtpu_push_header(const struct netdev *netdev,
     unsigned int payload_len;
 
     payload_len = dp_packet_size(packet);
-    udp = netdev_tnl_push_ip_header(packet, data->header, data->header_len,
+    udp = netdev_tnl_push_ip_header(0, packet, data->header, data->header_len,
                                     &ip_tot_size, 0);
     udp->udp_src = netdev_tnl_get_src_port(packet);
     udp->udp_len = htons(ip_tot_size);
@@ -919,7 +1005,7 @@  netdev_srv6_push_header(const struct netdev *netdev OVS_UNUSED,
         break;
     }
 
-    netdev_tnl_push_ip_header(packet, data->header,
+    netdev_tnl_push_ip_header(0, packet, data->header,
                               data->header_len, &ip_tot_size, ipv6_label);
 }
 
diff --git a/lib/netdev-native-tnl.h b/lib/netdev-native-tnl.h
index eb55dd041..d95a34b90 100644
--- a/lib/netdev-native-tnl.h
+++ b/lib/netdev-native-tnl.h
@@ -138,8 +138,9 @@  void *
 netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
                              unsigned int *hlen);
 void *
-netdev_tnl_push_ip_header(struct dp_packet *packet, const void *header,
-                          int size, int *ip_tot_size, ovs_be32 ipv6_label);
+netdev_tnl_push_ip_header(uint64_t netdev_ol_flags, struct dp_packet *packet,
+                          const void *header, int size,
+                          int *ip_tot_size, ovs_be32 ipv6_label);
 void
 netdev_tnl_egress_port_range(struct unixctl_conn *conn, int argc,
                              const char *argv[], void *aux OVS_UNUSED);
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index a7393c7ce..22840a058 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -43,6 +43,10 @@  enum netdev_ol_flags {
     NETDEV_TX_OFFLOAD_UDP_CKSUM = 1 << 2,
     NETDEV_TX_OFFLOAD_SCTP_CKSUM = 1 << 3,
     NETDEV_TX_OFFLOAD_TCP_TSO = 1 << 4,
+    NETDEV_TX_VXLAN_TNL_TSO = 1 << 5,
+    NETDEV_TX_GENEVE_TNL_TSO = 1 << 6,
+    NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM = 1 << 7,
+    NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM = 1 << 8,
 };
 
 /* A network device (e.g. an Ethernet device).
diff --git a/lib/netdev.c b/lib/netdev.c
index e5ac7713d..7120983ff 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -953,16 +953,26 @@  netdev_push_header(const struct netdev *netdev,
     size_t i, size = dp_packet_batch_size(batch);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, batch) {
-        if (OVS_UNLIKELY(dp_packet_hwol_is_tso(packet))) {
-            COVERAGE_INC(netdev_push_header_drops);
-            dp_packet_delete(packet);
-            VLOG_WARN_RL(&rl, "%s: Tunneling packets with TSO is "
-                         "not supported: packet dropped",
-                         netdev_get_name(netdev));
-        } else {
-            /* The packet is going to be encapsulated and there is
-             * no support yet for inner network header csum offloading. */
-            dp_packet_ol_send_prepare(packet, 0);
+    if (OVS_UNLIKELY(strcmp(netdev_get_type(netdev), "vxlan") &&
+        strcmp(netdev_get_type(netdev), "geneve") &&
+        dp_packet_hwol_is_tso(packet))) {
+        COVERAGE_INC(netdev_push_header_drops);
+        dp_packet_delete(packet);
+        VLOG_WARN_RL(&rl,
+                     "%s: Tunneling packets with tso HW offload flags"
+                     "is not supported: packet dropped",
+        netdev_get_name(netdev));
+    } else {
+            /* The packet is going to be encapsulated, geneve/vxlan inner
+             * network header csum offloading and tso are supported. Other
+             * tunnel inner csum and tso are not supported yet then we
+             * offer the software fallback.*/
+            if (!strcmp(netdev_get_type(netdev), "vxlan") ||
+                !strcmp(netdev_get_type(netdev), "geneve")) {
+                dp_packet_ol_send_prepare(packet, netdev->ol_flags);
+            } else {
+                dp_packet_ol_send_prepare(packet, 0);
+            }
 
             netdev->netdev_class->push_header(netdev, packet, data);
 
@@ -1409,6 +1419,10 @@  netdev_get_status(const struct netdev *netdev, struct smap *smap)
         OL_ADD_STAT("udp_csum", NETDEV_TX_OFFLOAD_UDP_CKSUM);
         OL_ADD_STAT("sctp_csum", NETDEV_TX_OFFLOAD_SCTP_CKSUM);
         OL_ADD_STAT("tcp_seg", NETDEV_TX_OFFLOAD_TCP_TSO);
+        OL_ADD_STAT("vxlan_tso", NETDEV_TX_VXLAN_TNL_TSO);
+        OL_ADD_STAT("geneve_tso", NETDEV_TX_GENEVE_TNL_TSO);
+        OL_ADD_STAT("out_ip_csum", NETDEV_TX_OFFLOAD_OUTER_IP_CKSUM);
+        OL_ADD_STAT("out_udp_csum", NETDEV_TX_OFFLOAD_OUTER_UDP_CKSUM);
 #undef OL_ADD_STAT
 
         err = 0;
diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at
index 85119fb81..0f6271760 100644
--- a/tests/dpif-netdev.at
+++ b/tests/dpif-netdev.at
@@ -658,11 +658,11 @@  OVS_VSWITCHD_START(
                      other-config:datapath-id=1234 fail-mode=secure])
 
 AT_CHECK([ovs-vsctl get interface p1 status | sed -n 's/^{\(.*\).*}$/\1/p'], [0], [dnl
-tx_ip_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false"
+tx_geneve_tso_offload="false", tx_ip_csum_offload="false", tx_out_ip_csum_offload="false", tx_out_udp_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false", tx_vxlan_tso_offload="false"
 ], [])
 
 AT_CHECK([ovs-vsctl get interface br0 status | sed -n 's/^{\(.*\).*}$/\1/p'], [0], [dnl
-tx_ip_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false"
+tx_geneve_tso_offload="false", tx_ip_csum_offload="false", tx_out_ip_csum_offload="false", tx_out_udp_csum_offload="false", tx_sctp_csum_offload="false", tx_tcp_csum_offload="false", tx_tcp_seg_offload="false", tx_udp_csum_offload="false", tx_vxlan_tso_offload="false"
 ], [])
 
 OVS_VSWITCHD_STOP