Message ID | 20200807105648.94860-3-yang_y_yi@163.com
---|---
State | Changes Requested
Series | userspace: enable VXLAN TSO, GSO and GRO
On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
> From: Yi Yang <yangyi01@inspur.com>
>
> GSO (Generic Segmentation Offload) can segment large UDP
> and TCP packets into small packets per the MTU of the
> destination, especially for the case where the physical NIC
> can't do hardware-offloaded VXLAN TSO and VXLAN UFO; GSO
> makes sure userspace TSO still works instead of dropping
> packets.
>
> In addition, GSO can help improve UDP performance when
> UFO is enabled in the VM.
>
> GSO supports TCP, UDP, VXLAN TCP, and VXLAN UDP; it is
> done in the Tx function of the physical NIC.
>
> Signed-off-by: Yi Yang <yangyi01@inspur.com>
> ---
>  lib/dp-packet.h    |  21 +++-
>  lib/netdev-dpdk.c  | 358 +++++++++++++++++++++++++++++++++++++++++++++++++----
>  lib/netdev-linux.c |  17 ++-
>  lib/netdev.c       |  67 +++++++---
>  4 files changed, 417 insertions(+), 46 deletions(-)
>
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 79895f2..c33868d 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -83,6 +83,8 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
>      /* VXLAN TCP Segmentation Offload. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 0x1000),
> +    /* UDP Segmentation Offload. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000),
>      /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
>  };
>
> @@ -97,7 +99,8 @@ enum dp_packet_offload_mask {
>                                           DP_PACKET_OL_TX_IPV6 | \
>                                           DP_PACKET_OL_TX_TCP_CKSUM | \
>                                           DP_PACKET_OL_TX_UDP_CKSUM | \
> -                                         DP_PACKET_OL_TX_SCTP_CKSUM)
> +                                         DP_PACKET_OL_TX_SCTP_CKSUM | \
> +                                         DP_PACKET_OL_TX_UDP_SEG)
>
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \
> @@ -956,6 +959,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b)
>      return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG);
>  }
>
> +/* Returns 'true' if packet 'b' is marked for UDP segmentation offloading. */
> +static inline bool
> +dp_packet_hwol_is_uso(const struct dp_packet *b)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG);
> +}
> +
>  /* Returns 'true' if packet 'b' is marked for IPv4 checksum offloading. */
>  static inline bool
>  dp_packet_hwol_is_ipv4(const struct dp_packet *b)
> @@ -1034,6 +1044,15 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>  }
>
> +/* Mark packet 'b' for UDP segmentation offloading. It implies that
> + * either the packet 'b' is marked for IPv4 or IPv6 checksum offloading
> + * and also for UDP checksum offloading. */
> +static inline void
> +dp_packet_hwol_set_udp_seg(struct dp_packet *b)
> +{
> +    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG;
> +}
> +
>  #ifdef DPDK_NETDEV
>  /* Mark packet 'b' for VXLAN TCP segmentation offloading. */
>  static inline void
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index 30493ed..888a45e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -38,13 +38,15 @@
>  #include <rte_errno.h>
>  #include <rte_ethdev.h>
>  #include <rte_flow.h>
> +#include <rte_gso.h>
> +#include <rte_ip.h>
>  #include <rte_malloc.h>
>  #include <rte_mbuf.h>
>  #include <rte_meter.h>
>  #include <rte_pci.h>
>  #include <rte_version.h>
>  #include <rte_vhost.h>
> -#include <rte_ip.h>
> +#include <rte_ip_frag.h>
>
>  #include "cmap.h"
>  #include "coverage.h"
> @@ -162,6 +164,7 @@ typedef uint16_t dpdk_port_t;
>                                       | DEV_TX_OFFLOAD_UDP_CKSUM \
>                                       | DEV_TX_OFFLOAD_IPV4_CKSUM)
>
> +#define MAX_GSO_MBUFS 64
>
>  static const struct rte_eth_conf port_conf = {
>      .rxmode = {
> @@ -2171,6 +2174,16 @@ is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
>      return ret;
>  }
>
> +static uint16_t
> +get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype)
> +{
> +    if (ethertype == htons(RTE_ETHER_TYPE_IPV4)) {
> +        return rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
> +    } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
> +        return rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
> +    }
> +}
> +
>  /* Prepare the packet for HWOL.
>   * Return True if the packet is OK to continue. */
>  static bool
> @@ -2203,10 +2216,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>       * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared
>       * outer_l2_len and outer_l3_len must be zeroed.
>       */
> -    if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)
> -        || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
> +    if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
>          && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> -            + mbuf->l2_len))) {
> +            + mbuf->l2_len)) {
>          mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
>          mbuf->l2_len -= sizeof(struct udp_header)
>                          + sizeof(struct vxlanhdr);
> @@ -2249,7 +2261,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>              /* inner IP checksum offload offload */
>              mbuf->ol_flags |= PKT_TX_IP_CKSUM;
>          }
> -    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
> +    } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
>          /* Handle VLAN TSO */
>          /* no inner IP checksum for IPV6 */
>          if (mbuf->ol_flags & PKT_TX_IPV4) {
> @@ -2273,6 +2285,18 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
>          mbuf->outer_l2_len = 0;
>          mbuf->outer_l3_len = 0;
> +
> +        /* In case of GRO, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set by GRO
> +         * APIs, here is a place we can mark it.
> +         */
> +        if ((mbuf->pkt_len > 1464)
> +            && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
> +            if (l4_proto == IPPROTO_UDP) {
> +                mbuf->ol_flags |= PKT_TX_UDP_SEG;
> +            } else if (l4_proto == IPPROTO_TCP) {
> +                mbuf->ol_flags |= PKT_TX_TCP_SEG;
> +            }
> +        }
>      }
>
>      /* It is possible that l4_len isn't set for vhostuserclient */
> @@ -2284,6 +2308,10 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
>      }
>
> +    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
> +        return true;
> +    }
> +
>      if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
>          if (l4_proto != IPPROTO_UDP) {
>              VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
> @@ -2294,11 +2322,13 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>                 mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
>          if (l4_proto != IPPROTO_TCP) {
>              VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
> -                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
> +                         " pkt len: %"PRIu32" l4_proto = %d",
> +                         dev->up.name, mbuf->pkt_len, l4_proto);
>              return false;
>          }
>
> -        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
> +        if (mbuf->pkt_len > 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
> +                            + mbuf->l2_len) {
>              dp_packet_hwol_set_tcp_seg(pkt);
>          }
>
> @@ -2308,7 +2338,66 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
>          } else {
>              mbuf->tso_segsz = 0;
>          }
> +
> +        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> +            /* PKT_TX_TCP_CKSUM must be cleaned for GSO because
> +             * tcp checksum only can be caculated by software for
> +             * GSO case.
> +             */
> +            mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM;
> +        }
>      }
> +
> +    /* UDP GSO if necessary */
> +    if (l4_proto == IPPROTO_UDP) {
> +        /* VXLAN GSO can be done here */
> +        if ((mbuf->ol_flags & PKT_TX_UDP_SEG) ||
> +            (mbuf->pkt_len > (1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> +                              mbuf->l2_len))) {
> +            dp_packet_hwol_set_udp_seg(pkt);
> +
> +            /* For UDP GSO, udp checksum must be calculated by software */
> +            if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
> +                void *l3_hdr, *l4_hdr;
> +                struct rte_udp_hdr *udp_hdr;
> +
> +                /* PKT_TX_UDP_CKSUM must be cleaned for GSO because
> +                 * udp checksum only can be caculated by software for
> +                 * GSO case.
> +                 */
> +                mbuf->ol_flags &= ~PKT_TX_UDP_CKSUM;
> +
> +                eth_hdr = (struct rte_ether_hdr *)
> +                    ((uint8_t *)eth_hdr + mbuf->outer_l2_len +
> +                     mbuf->outer_l3_len +
> +                     sizeof(struct udp_header) +
> +                     sizeof(struct vxlanhdr));
> +                l3_hdr = (uint8_t *)eth_hdr + mbuf->l2_len -
> +                         sizeof(struct udp_header) -
> +                         sizeof(struct vxlanhdr);
> +                l4_hdr = (uint8_t *)l3_hdr + mbuf->l3_len;
> +                ip_hdr = (struct rte_ipv4_hdr *)l3_hdr;
> +                ip_hdr->hdr_checksum = 0;
> +                ip_hdr->hdr_checksum = rte_ipv4_cksum(ip_hdr);
> +                /* Don't touch UDP checksum if it is ip fragment */
> +                if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) {
> +                    udp_hdr = (struct rte_udp_hdr *)l4_hdr;
> +                    udp_hdr->dgram_cksum = 0;
> +                    udp_hdr->dgram_cksum =
> +                        get_udptcp_checksum(l3_hdr, l4_hdr,
> +                                            eth_hdr->ether_type);
> +                }
> +            }
> +
> +            /* FOR GSO, gso_size includes l2_len + l3_len */
> +            mbuf->tso_segsz = 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len +
> +                              mbuf->l2_len;
> +            if (mbuf->tso_segsz > dev->mtu) {
> +                mbuf->tso_segsz = dev->mtu;
> +            }
> +        }
> +    }
> +
>      return true;
>  }
>
> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
>      return cnt;
>  }
>
> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
> - * 'pkts', even in case of failure.
> - *
> - * Returns the number of packets that weren't transmitted. */
>  static inline int
> -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> -                         struct rte_mbuf **pkts, int cnt)
> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> +                           struct rte_mbuf **pkts, int cnt)
>  {
>      uint32_t nb_tx = 0;
> -    uint16_t nb_tx_prep = cnt;
> +    uint32_t nb_tx_prep;
>
> -    if (userspace_tso_enabled()) {
> -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> -        if (nb_tx_prep != cnt) {
> -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> -                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
> -                         cnt, rte_strerror(rte_errno));
> -        }
> +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> +    if (nb_tx_prep != cnt) {
> +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> +                     "Only %u/%u are valid: %s",
> +                     dev->up.name, nb_tx_prep,
> +                     cnt, rte_strerror(rte_errno));
>      }
>
>      while (nb_tx != nb_tx_prep) {
> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
>      return cnt - nb_tx;
>  }
>
> +static inline void
> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)

I didn't review the patch, only had a quick glance, but this part
bothers me.  OVS doesn't support multi-segment mbufs, so it should not
be possible for such mbufs being transmitted by OVS.  So, I do not
understand why this function needs to work with such mbufs.

> +{
> +    uint16_t l3_offset = mbuf->outer_l2_len + mbuf->outer_l3_len
> +                         + mbuf->l2_len;
> +    struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
> +        (rte_pktmbuf_mtod(mbuf, char *) + l3_offset);
> +    struct rte_tcp_hdr *tcp_hdr;
> +    uint32_t l4_hdr_len;
> +    uint8_t *l4_hdr;
> +    struct rte_mbuf *next = mbuf->next;
> +    uint32_t cksum = 0;
> +    uint16_t l4_proto;
> +    uint32_t inner_cksum;
> +
> +    l4_proto = ipv4_hdr->next_proto_id;
> +    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
> +        return;
> +    }
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        /* For TCP GSO, inner TCP header is in every seg,
> +         * TCP checksum has to be calculated by software.
> +         */
> +        l4_hdr_len = mbuf->data_len - l3_offset
> +                     - sizeof(struct rte_ipv4_hdr);
> +        l4_hdr = (uint8_t *)(ipv4_hdr + 1);
> +        tcp_hdr = (struct rte_tcp_hdr *)l4_hdr;
> +        tcp_hdr->cksum = 0;
> +    }
> +
> +    /* Set inner ip checksum */
> +    ipv4_hdr->hdr_checksum = 0;
> +    ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
> +
> +    if (l4_proto == IPPROTO_TCP) {
> +        cksum = rte_raw_cksum(l4_hdr, l4_hdr_len);
> +    } else if (l4_proto == IPPROTO_UDP) {
> +        if (next == NULL) {
> +            /* It wasn't GSOed */
> +            cksum = rte_raw_cksum(ipv4_hdr + 1,
> +                                  ntohs(ipv4_hdr->total_length)
> +                                  - sizeof(struct rte_ipv4_hdr));
> +        } else {
> +            cksum = 0;
> +        }
> +    }
> +
> +    /* It was GSOed */
> +    while (next) {
> +        cksum += rte_raw_cksum(rte_pktmbuf_mtod(next, char *), next->data_len);
> +        next = next->next;
> +    }
> +
> +    /* Save cksum to inner_cksum, outer udp checksum needs it */
> +    inner_cksum = cksum;
> +
> +    cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0);
> +    cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
> +    cksum = (~cksum) & 0xffff;
> +    if (cksum == 0) {
> +        cksum = 0xffff;
> +    }
> +
> +    /* Set inner TCP checksum */
> +    if (l4_proto == IPPROTO_TCP) {
> +        tcp_hdr->cksum = (uint16_t)cksum;
> +    }
> +
> +    /* Set outer udp checksum in case of VXLAN */
> +    if (mbuf->outer_l2_len != 0) {
> +        ipv4_hdr = (struct rte_ipv4_hdr *)
> +            (rte_pktmbuf_mtod(mbuf, char *) + mbuf->outer_l2_len);
> +        struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *)
> +            (ipv4_hdr + 1);
> +
> +        /* Set outer ip checksum */
> +        ipv4_hdr->hdr_checksum = 0;
> +        ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
> +
> +        udp_hdr->dgram_cksum = 0;
> +        cksum = rte_ipv4_phdr_cksum(ipv4_hdr, 0);
> +        cksum += rte_raw_cksum(udp_hdr, mbuf->l2_len + mbuf->l3_len);
> +        cksum += inner_cksum;
> +        if (l4_proto == IPPROTO_TCP) {
> +            cksum += tcp_hdr->cksum;
> +        }
> +        cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
> +        cksum = (~cksum) & 0xffff;
> +        if (cksum == 0) {
> +            cksum = 0xffff;
> +        }
> +        udp_hdr->dgram_cksum = (uint16_t)cksum;
> +    }
> +}
> +
> +/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
> + * 'pkts', even in case of failure.
> + *
> + * Returns the number of packets that weren't transmitted. */
> +static inline int
> +netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
> +                         struct rte_mbuf **pkts, int cnt)
> +{
> +    uint32_t nb_tx = 0;
> +    int i;
> +    int ret;
> +    int failures = 0;
> +
> +    if (userspace_tso_enabled()) {
> +        /* The best point to do gso */
> +        struct rte_gso_ctx gso_ctx;
> +        struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
> +        int tx_start = -1;
> +
> +        /* Setup gso context */
> +        gso_ctx.direct_pool = dev->dpdk_mp->mp;
> +        gso_ctx.indirect_pool = dev->dpdk_mp->mp;
> +
> +        /* Do GSO if needed */
> +        for (i = 0; i < cnt; i++) {
> +            if (((pkts[i]->ol_flags & PKT_TX_UDP_SEG) &&
> +                 !(dev->hw_ol_features & DEV_TX_OFFLOAD_UDP_TSO)) ||
> +                ((pkts[i]->ol_flags & PKT_TX_TCP_SEG) &&
> +                 ((!(dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD)
> +                   && (pkts[i]->ol_flags & PKT_TX_TUNNEL_VXLAN))
> +                  || !(dev->hw_ol_features & DEV_TX_OFFLOAD_TCP_TSO)))) {
> +                /* Send non GSO packets before pkts[i] */
> +                if (tx_start != -1) {
> +                    failures += __netdev_dpdk_eth_tx_burst(
> +                        dev, qid,
> +                        pkts + tx_start,
> +                        i - tx_start);
> +                }
> +                tx_start = -1;
> +
> +                gso_ctx.gso_types = 0;
> +                gso_ctx.gso_size = pkts[i]->tso_segsz;
> +                gso_ctx.flag = 0;
> +                if (pkts[i]->ol_flags & PKT_TX_TUNNEL_VXLAN) {
> +                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
> +                }
> +                if (pkts[i]->ol_flags & PKT_TX_UDP_SEG) {
> +                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_UDP_TSO;
> +                } else if (pkts[i]->ol_flags & PKT_TX_TCP_SEG) {
> +                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_TCP_TSO;
> +                    pkts[i]->ol_flags &= ~PKT_TX_TCP_CKSUM;
> +                }
> +                ret = rte_gso_segment(pkts[i],  /* packet to segment */
> +                                      &gso_ctx, /* gso context */
> +                                      /* gso output mbufs */
> +                                      (struct rte_mbuf **)&gso_mbufs,
> +                                      MAX_GSO_MBUFS);
> +                if (ret < 0) {
> +                    rte_pktmbuf_free(pkts[i]);
> +                } else {
> +                    int j, k;
> +                    struct rte_mbuf * next_part;
> +                    nb_tx = ret;
> +                    for (j = 0; j < nb_tx; j++) {
> +                        set_multiseg_udptcp_cksum(gso_mbufs[j]);
> +                        /* Clear them because of no offload */
> +                        gso_mbufs[j]->ol_flags = 0;
> +                        gso_mbufs[j]->outer_l2_len = 0;
> +                        gso_mbufs[j]->outer_l3_len = 0;
> +                        gso_mbufs[j]->l2_len = 0;
> +                        gso_mbufs[j]->l3_len = 0;
> +                        gso_mbufs[j]->l4_len = 0;
> +                        next_part = gso_mbufs[j];
> +                        for (k = 0; k < gso_mbufs[j]->nb_segs; k++) {
> +                            next_part = next_part->next;
> +                        }
> +                    }
> +                    __netdev_dpdk_eth_tx_burst(dev, qid, gso_mbufs, nb_tx);
> +                }
> +                continue;
> +            }
> +            if (tx_start == -1) {
> +                tx_start = i;
> +            }
> +        }
> +
> +        if (tx_start != -1) {
> +            /* Send non GSO packets before pkts[i] */
> +            failures += __netdev_dpdk_eth_tx_burst(dev, qid, pkts + tx_start,
> +                                                   i - tx_start);
> +        }
> +        return failures;
> +    }
> +
> +    return __netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt);
> +}
> +
>  static inline bool
>  netdev_dpdk_srtcm_policer_pkt_handle(struct rte_meter_srtcm *meter,
>                                       struct rte_meter_srtcm_profile *profile,
> @@ -2786,10 +3064,24 @@ out:
>      }
>  }
>
> +struct shinfo_arg {
> +    void * buf;
> +    struct rte_mbuf *mbuf;
> +};
> +
> +/* For GSO case, the extended mbuf only can be freed by
> + * netdev_dpdk_extbuf_free
> + */
>  static void
> -netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
> +netdev_dpdk_extbuf_free(struct rte_mbuf *m, void *opaque)
>  {
> -    rte_free(opaque);
> +    struct shinfo_arg *arg = (struct shinfo_arg *)opaque;
> +
> +    rte_free(arg->buf);
> +    if (m != arg->mbuf) {
> +        rte_pktmbuf_free(arg->mbuf);
> +    }
> +    free(arg);
>  }
>
>  static struct rte_mbuf *
> @@ -2821,8 +3113,11 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
>
>      /* Initialize shinfo. */
>      if (shinfo) {
> +        struct shinfo_arg *arg = xmalloc(sizeof(struct shinfo_arg));
> +        arg->buf = buf;
> +        arg->mbuf = pkt;
>          shinfo->free_cb = netdev_dpdk_extbuf_free;
> -        shinfo->fcb_opaque = buf;
> +        shinfo->fcb_opaque = arg;
>          rte_mbuf_ext_refcnt_set(shinfo, 1);
>      } else {
>          shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
> @@ -2852,6 +3147,10 @@ dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
>          return NULL;
>      }
>
> +    if (unlikely(pkt->shinfo != NULL)) {
> +        pkt->shinfo = NULL;
> +    }
> +
>      if (rte_pktmbuf_tailroom(pkt) >= data_len) {
>          return pkt;
>      }
> @@ -5192,6 +5491,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
>      int err;
>      uint64_t vhost_flags = 0;
>      uint64_t vhost_unsup_flags;
> +    uint64_t vhost_supported_flags;
>      bool zc_enabled;
>
>      ovs_mutex_lock(&dev->mutex);
> @@ -5277,6 +5577,16 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
>          goto unlock;
>      }
>
> +    err = rte_vhost_driver_get_features(dev->vhost_id,
> +                                        &vhost_supported_flags);
> +    if (err) {
> +        VLOG_ERR("rte_vhost_driver_get_features failed for "
> +                 "vhost user client port: %s\n", dev->up.name);
> +        goto unlock;
> +    }
> +    VLOG_INFO("vhostuserclient port %s features: 0x%016lx",
> +              dev->up.name, vhost_supported_flags);
> +
>      err = rte_vhost_driver_start(dev->vhost_id);
>      if (err) {
>          VLOG_ERR("rte_vhost_driver_start failed for vhost user "
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index 9f830b4..557f139 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -6566,6 +6566,16 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
>
>          l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
>          dp_packet_hwol_set_l4_len(b, l4_len);
> +    } else if (*l4proto == IPPROTO_UDP) {
> +        struct udp_header *udp_hdr = dp_packet_at(b, l2_len + l3_len,
> +                                                  sizeof(struct udp_header));
> +
> +        if (!udp_hdr) {
> +            return -EINVAL;
> +        }
> +
> +        l4_len = sizeof(struct udp_header);
> +        dp_packet_hwol_set_l4_len(b, l4_len);
>      }
>
>      return 0;
> @@ -6581,10 +6591,6 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
>          return -EINVAL;
>      }
>
> -    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
> -        return 0;
> -    }
> -
>      if (netdev_linux_parse_l2(b, &l4proto)) {
>          return -EINVAL;
>      }
> @@ -6609,6 +6615,9 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
>              || type == VIRTIO_NET_HDR_GSO_TCPV6) {
>              dp_packet_hwol_set_tcp_seg(b);
>          }
> +        if (type == VIRTIO_NET_HDR_GSO_UDP) {
> +            dp_packet_hwol_set_udp_seg(b);
> +        }
>      }
>
>      return 0;
> diff --git a/lib/netdev.c b/lib/netdev.c
> index 64583d1..02f28c8 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -825,23 +825,41 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
>  {
>      uint64_t l4_mask;
>
> -    if (dp_packet_hwol_is_vxlan_tcp_seg(packet)
> -        && (dp_packet_hwol_is_tso(packet) || dp_packet_hwol_l4_mask(packet))
> -        && !(netdev_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)) {
> -        /* Fall back to GSO in software. */
> -        VLOG_ERR_BUF(errormsg, "No VXLAN TSO support");
> -        return false;
> -    }
> -
> -    if (dp_packet_hwol_is_tso(packet)
> -        && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
> -        /* Fall back to GSO in software. */
> -        VLOG_ERR_BUF(errormsg, "No TSO support");
> -        return false;
> -    }
> -
> +    /* GSO can handle TSO by software even if device can't handle hardware
> +     * offload, so needn't check it here.
> +     */
>      l4_mask = dp_packet_hwol_l4_mask(packet);
>      if (l4_mask) {
> +        /* Calculate checksum for VLAN TSO case when no hardware offload
> +         * feature is available. Note: for VXLAN TSO case, checksum has
> +         * been calculated before here, so it won't be done here again
> +         * because checksum flags in packet->m.ol_flags have been cleaned.
> +         */
> +        if (dp_packet_hwol_l4_is_tcp(packet)
> +            && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
> +            && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
> +            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_TCP_CKSUM;
> +            /* Only calculate TCP checksum for non-TSO packet,
> +             * it will be calculated after GSO for TSO packet.
> +             */
> +            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_TCP_SEG)) {
> +                calculate_tcpudp_checksum(packet);
> +            }
> +            return true;
> +        } else if (dp_packet_hwol_l4_is_udp(packet)
> +                   && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
> +                   && !(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
> +            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
> +            /* Only calculate UDP checksum for non-UFO packet,
> +             * it will be calculated immediately before GSO for
> +             * UFO packet.
> +             */
> +            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_UDP_SEG)) {
> +                calculate_tcpudp_checksum(packet);
> +            }
> +            return true;
> +        }
> +
>          if (dp_packet_hwol_l4_is_tcp(packet)) {
>              if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
>                  /* Fall back to TCP csum in software. */
> @@ -1013,11 +1031,26 @@ netdev_push_header(const struct netdev *netdev,
>          /* VXLAN offload can't support udp checksum offload
>           * for inner udp packet, so udp checksum must be set
>           * before push header in order that outer checksum can
> -         * be set correctly.
> +         * be set correctly. But GSO code will set udp checksum
> +         * if packet->mbuf.ol_flags has DP_PACKET_OL_TX_UDP_SEG.
>           */
>          if (dp_packet_hwol_l4_is_udp(packet)) {
>              packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
> -            calculate_tcpudp_checksum(packet);
> +            /* Only calculate UDP checksum for non-UFO packet,
> +             * it will be calculated immediately before GSO for
> +             * UFO packet.
> +             */
> +            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_UDP_SEG)) {
> +                calculate_tcpudp_checksum(packet);
> +            }
> +        } else if (dp_packet_hwol_l4_is_tcp(packet)) {
> +            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_TCP_CKSUM;
> +            /* Only calculate TCP checksum for non-TSO packet,
> +             * it will be calculated after GSO for TSO packet.
> +             */
> +            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_TCP_SEG)) {
> +                calculate_tcpudp_checksum(packet);
> +            }
>          }
>      }
>      netdev->netdev_class->push_header(netdev, packet, data);
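Ilya's inline concern above hinges on how single-segment assumptions fail on chained mbufs. A minimal illustration, assuming only standard DPDK mbuf fields (this is not code from the patch; the function name is made up): a flat copy of pkt_len bytes out of the first segment reads past that segment's buffer, while correct code walks the chain.

```c
#include <string.h>
#include <rte_mbuf.h>

/* Copy an mbuf's payload into a flat buffer 'dst' of at least
 * m->pkt_len bytes.  The commented-out one-liner is the
 * single-segment shortcut that breaks on chained mbufs. */
static void
mbuf_copy_flat(const struct rte_mbuf *m, char *dst)
{
    /* Wrong for multi-segment mbufs: the first segment only holds
     * 'data_len' bytes, not the full 'pkt_len':
     *
     * memcpy(dst, rte_pktmbuf_mtod(m, const char *), m->pkt_len);
     */
    for (; m != NULL; m = m->next) {
        memcpy(dst, rte_pktmbuf_mtod(m, const char *), m->data_len);
        dst += m->data_len;
    }
}
```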
-----Original Message-----
From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of Ilya Maximets
Sent: October 27, 2020 21:03
To: yang_y_yi@163.com; ovs-dev@openvswitch.org
Cc: fbl@sysclose.org; i.maximets@ovn.org
Subject: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path

On 8/7/20 12:56 PM, yang_y_yi@163.com wrote:
> From: Yi Yang <yangyi01@inspur.com>
>
> GSO (Generic Segmentation Offload) can segment large UDP and TCP
> packets into small packets per the MTU of the destination, especially
> for the case where the physical NIC can't do hardware-offloaded VXLAN
> TSO and VXLAN UFO; GSO makes sure userspace TSO still works instead
> of dropping packets.

[snip]

> +static inline void
> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)

I didn't review the patch, only had a quick glance, but this part
bothers me.  OVS doesn't support multi-segment mbufs, so it should not
be possible for such mbufs being transmitted by OVS.  So, I do not
understand why this function needs to work with such mbufs.

[Yi Yang] Only the DPDK driver/Tx function will use it, not OVS.
set_multiseg_udptcp_cksum is called in the GSO part, the last step
before the Tx function; it is a big external mbuf before
rte_gso_segment(), which isn't a multi-segment mbuf.
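To make the flow Yi Yang describes concrete, here is a minimal sketch of the segment-then-transmit pattern, assuming the pre-20.11 DPDK GSO API the patch targets (where rte_gso_segment() consumes the input mbuf on success); the mempool, port, and queue arguments are placeholders, and the function name is illustrative.

```c
#include <rte_ethdev.h>
#include <rte_gso.h>
#include <rte_mbuf.h>

/* Segment one oversized TCP packet in software and transmit the
 * resulting chained mbufs right away, mirroring the patch's Tx-side
 * placement.  'pkt' is assumed to carry PKT_TX_TCP_SEG and valid
 * l2_len/l3_len/l4_len; error handling is minimal. */
static int
gso_then_tx(struct rte_mbuf *pkt, struct rte_mempool *mp,
            uint16_t port_id, uint16_t queue_id)
{
    struct rte_gso_ctx ctx = {
        .direct_pool = mp,     /* pool for per-segment headers */
        .indirect_pool = mp,   /* pool for indirect payload mbufs */
        .gso_types = DEV_TX_OFFLOAD_TCP_TSO,
        .gso_size = 1450,      /* segment size, as in the patch */
        .flag = 0,
    };
    struct rte_mbuf *segs[64];
    int n = rte_gso_segment(pkt, &ctx, segs, 64);

    if (n < 0) {
        rte_pktmbuf_free(pkt); /* input is kept on error */
        return n;
    }
    /* Each entry of segs[] may be a multi-segment (chained) mbuf. */
    return rte_eth_tx_burst(port_id, queue_id, segs, (uint16_t)n);
}
```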
On Tue, Oct 27, 2020 at 6:02 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote:
>
> -----Original Message-----
> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of Ilya Maximets
> Sent: October 27, 2020 21:03
> Subject: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path

snip

> > +static inline void
> > +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
>
> I didn't review the patch, only had a quick glance, but this part
> bothers me.  OVS doesn't support multi-segment mbufs, so it should not
> be possible for such mbufs being transmitted by OVS.  So, I do not
> understand why this function needs to work with such mbufs.
>
> [Yi Yang] Only the DPDK driver/Tx function will use it, not OVS.
> set_multiseg_udptcp_cksum is called in the GSO part, the last step
> before the Tx function; it is a big external mbuf before
> rte_gso_segment(), which isn't a multi-segment mbuf.
>

Hi Ilya,

Now I understand Yi Yang's point better and I agree with him.
Looks like the patch does the GSO at the DPDK TX function.
It creates multi-seg mbufs after rte_gso_segment(), but will
immediately send the multi-seg mbufs out of the DPDK port, without
traversing other parts of the OVS code.  I guess in this case it
should work OK?

William
Yes, GSO is OK, but GRO may have that issue.  I didn't see that issue
in my OpenStack environment, so it will be great if we can have a test
case to trigger it.

-----Original Message-----
From: William Tu [mailto:u9012063@gmail.com]
Sent: February 7, 2021 23:46
To: Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com>
Cc: i.maximets@ovn.org; yang_y_yi@163.com; ovs-dev@openvswitch.org; fbl@sysclose.org
Subject: Re: [ovs-dev] Re: [PATCH V3 2/4] Add GSO support for DPDK data path

[snip]
On 2/8/21 1:56 AM, Yi Yang (杨燚)-云服务集团 wrote:
> Yes, GSO is OK, but GRO may have that issue.  I didn't see that issue
> in my OpenStack environment, so it will be great if we can have a test
> case to trigger it.

[snip]

>> [Yi Yang] Only the DPDK driver/Tx function will use it, not OVS.
>> set_multiseg_udptcp_cksum is called in the GSO part, the last step
>> before the Tx function; it is a big external mbuf before
>> rte_gso_segment(), which isn't a multi-segment mbuf.
>
> Hi Ilya,
>
> Now I understand Yi Yang's point better and I agree with him.
> Looks like the patch does the GSO at the DPDK TX function.
> It creates multi-seg mbufs after rte_gso_segment(), but will
> immediately send the multi-seg mbufs out of the DPDK port, without
> traversing other parts of the OVS code.  I guess in this case it
> should work OK?

Hi.  GSO itself should be possible to implement, but we should not
enable support for multi-segment mbufs in the NIC, since this might
end up in multi-segment packets being received by OVS, causing lots of
trouble (for example, dp_packet_clone() uses a simple memcpy() that
will result in invalid memory reads).

GSO should be implemented the same way TSO is implemented.  Right now
the TSO implementation has no software fallback to segment large
packets if a netdev doesn't support them; it simply drops them.
However, a software fallback should be implemented someday, and it
should be implemented in a generic way for all the netdev types
instead of only for one (netdev-dpdk).  A brief look at the patch
tells me that the generic check in netdev_send_prepare_packet() was
removed.  This means that huge packets could reach e.g. netdev-afxdp,
which will not behave correctly.

A generic software fallback should produce a sequence of 'dp_packet's
instead of multi-segment mbufs, so there should be no need to enable
support for multi-segment mbufs in the DPDK PMD.

Best regards, Ilya Maximets.
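A rough sketch of the generic fallback Ilya describes, assuming OVS's dp_packet and dp_packet_batch APIs; the function name is hypothetical, and per-segment header fix-up (IP total length, IP ID, checksums, TCP sequence numbers) is deliberately elided. The point it illustrates is that the output is a batch of ordinary dp_packets, not multi-segment mbufs, so it works for any netdev type.

```c
#include "dp-packet.h"
#include "util.h"

/* Split one oversized 'pkt' into MTU-sized dp_packets: copy the first
 * 'hdr_len' header bytes in front of each payload slice of at most
 * 'seg_size' bytes and collect the results in 'out'.  Header
 * rewriting per segment is omitted here. */
static void
soft_segment(const struct dp_packet *pkt, size_t hdr_len,
             size_t seg_size, struct dp_packet_batch *out)
{
    const char *data = dp_packet_data(pkt);
    size_t total = dp_packet_size(pkt);

    for (size_t off = hdr_len; off < total; off += seg_size) {
        size_t chunk = MIN(seg_size, total - off);
        struct dp_packet *seg = dp_packet_new(hdr_len + chunk);

        dp_packet_put(seg, data, hdr_len);      /* replicate headers */
        dp_packet_put(seg, data + off, chunk);  /* payload slice */
        dp_packet_batch_add(out, seg);
    }
}
```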
On Mon, Feb 8, 2021 at 6:36 AM Ilya Maximets <i.maximets@ovn.org> wrote:

[snip]

> Hi.  GSO itself should be possible to implement, but we should not
> enable support for multi-segment mbufs in the NIC, since this might
> end up in multi-segment packets being received by OVS, causing lots
> of trouble (for example, dp_packet_clone() uses a simple memcpy()
> that will result in invalid memory reads).

I see your point, thanks!
I guess you're saying that we have to do something like
nic->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS to enable the NIC's
multi-segment support for GSO.  And once it is enabled, OVS might
receive multi-segment mbufs from the NIC, and OVS doesn't support
that.

> A generic software fallback should produce a sequence of 'dp_packet's
> instead of multi-segment mbufs, so there should be no need to enable
> support for multi-segment mbufs in the DPDK PMD.

I agree, thanks!
So which one do we need for dp_packet:
1) GSO, 2) tunneled GSO (e.g. VXLAN and Geneve), or 3) both?

I feel we can assume all HW NICs today support TSO, but not tunneled
packets' TSO.  So we only need to implement (2).

William
On 2/8/21 8:54 PM, William Tu wrote: > On Mon, Feb 8, 2021 at 6:36 AM Ilya Maximets <i.maximets@ovn.org> wrote: >> >> On 2/8/21 1:56 AM, Yi Yang (杨燚)-云服务集团 wrote: >>> Yes, GSO is ok, but GRO may be have that issue, I didn't see that issue in my openstack environment, so maybe it will be great if we can have a test case to trigger that issue. >>> >>> -----邮件原件----- >>> 发件人: William Tu [mailto:u9012063@gmail.com] >>> 发送时间: 2021年2月7日 23:46 >>> 收件人: Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> >>> 抄送: i.maximets@ovn.org; yang_y_yi@163.com; ovs-dev@openvswitch.org; fbl@sysclose.org >>> 主题: Re: [ovs-dev] 答复: [PATCH V3 2/4] Add GSO support for DPDK data path >>> >>> On Tue, Oct 27, 2020 at 6:02 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote: >>>> >>>> -----邮件原件----- >>>> 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets >>>> 发送时间: 2020年10月27日 21:03 >>>> 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org >>>> 抄送: fbl@sysclose.org; i.maximets@ovn.org >>>> 主题: Re: [ovs-dev] [PATCH V3 2/4] Add GSO support for DPDK data path >>>> >>>> On 8/7/20 12:56 PM, yang_y_yi@163.com wrote: >>>>> From: Yi Yang <yangyi01@inspur.com> >>>>> >>>>> GSO(Generic Segment Offload) can segment large UDP and TCP packet >>>>> to small packets per MTU of destination , especially for the case >>>>> that physical NIC can't do hardware offload VXLAN TSO and VXLAN UFO, >>>>> GSO can make sure userspace TSO can still work but not drop. >>>>> >>>>> In addition, GSO can help improve UDP performane when UFO is enabled >>>>> in VM. >>>>> >>>>> GSO can support TCP, UDP, VXLAN TCP, VXLAN UDP, it is done in Tx >>>>> function of physical NIC. >>>>> >>>>> Signed-off-by: Yi Yang <yangyi01@inspur.com> >>>>> --- >>>>> lib/dp-packet.h | 21 +++- >>>>> lib/netdev-dpdk.c | 358 >>>>> +++++++++++++++++++++++++++++++++++++++++++++++++---- >>>>> lib/netdev-linux.c | 17 ++- >>>>> lib/netdev.c | 67 +++++++--- >>>>> 4 files changed, 417 insertions(+), 46 deletions(-) >>> >>> snip >>> >>>>> >>>>> @@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts, >>>>> return cnt; >>>>> } >>>>> >>>>> -/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes >>>>> ownership of >>>>> - * 'pkts', even in case of failure. >>>>> - * >>>>> - * Returns the number of packets that weren't transmitted. */ >>>>> static inline int -netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, >>>>> - struct rte_mbuf **pkts, int cnt) >>>>> +__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, >>>>> + struct rte_mbuf **pkts, int cnt) >>>>> { >>>>> uint32_t nb_tx = 0; >>>>> - uint16_t nb_tx_prep = cnt; >>>>> + uint32_t nb_tx_prep; >>>>> >>>>> - if (userspace_tso_enabled()) { >>>>> - nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt); >>>>> - if (nb_tx_prep != cnt) { >>>>> - VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. " >>>>> - "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep, >>>>> - cnt, rte_strerror(rte_errno)); >>>>> - } >>>>> + nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt); >>>>> + if (nb_tx_prep != cnt) { >>>>> + VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. 
" >>>>> + "Only %u/%u are valid: %s", >>>>> + dev->up.name, nb_tx_prep, >>>>> + cnt, rte_strerror(rte_errno)); >>>>> } >>>>> >>>>> while (nb_tx != nb_tx_prep) { >>>>> @@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid, >>>>> return cnt - nb_tx; >>>>> } >>>>> >>>>> +static inline void >>>>> +set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf) >>>> >>>> I didn't review the patch, only had a quick glance, but this part bothers me. OVS doesn't support multi-segment mbufs, so it should not be possible for such mbufs being transmitted by OVS. So, I do not understand why this function needs to work with such mbufs. >>>> >>>> [Yi Yang] Only DPDK driver/Tx function will use it, not OVS, set_multiseg_udptcp_cksum is called in GSO part, it is last step before Tx function, it is a big external mbuf before rte_gso_segment, that isn't a multi-segmented mbuf. >>>> >>> >>> Hi Ilya, >>> >>> Now I understand Yi Yang's point better and I agree with him. >>> Looks like the patch does the GSO at the DPDK TX function. >>> It creates multi-seg mbuf after rte_gso_segment(), but will immediately send out the multi-seg mbuf to DPDK port, without traversing inside other part of OVS code. I guess this case it should work OK? >> >> Hi. GSO itself should be possible to implement, but we should >> not enable support for multi-segment mbufs in the NIC, since this >> might end up in multi-segment packets being received by OVS >> causing lots of trouble (for example dp_packet_clone() uses simple >> memcpy() that will result in invalid memory reads.) > > I see your point, thanks! > I guess you're saying that we have to do s.t like: > nic->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS > to enable NIC's multi-segment support for GSO. > And once enabling it, OVS might receive multi-segment > mbuf from the NIC, and OVS doesn't support it. OK. This probably is not true. I missed the fact that MULTI_SEGS enabled only for tx offloads. So, this should work, I guess. But still, software fallback should not be implemented only for netdev-dpdk. It's needed for other port types too and it should not depend on DPDK. > >> >> GSO should be implemented in a same way TSO implemented. Right now >> TSO implementation has no software fallback to segment large packets >> if netdev doesn't support - it it simply drops them. However, >> software fallback should be implemented someday and it should be >> implemented in a generic way for all the netdev types instead of >> doing it only for one (netdev-dpdk). Brief look at the patch tells >> me that the generic check from netdev_send_prepare_packet() was >> removed. This means that huge packets could reach e.g. netdev-afxdp >> that will not behave correctly. >> Generic software fallback should produce sequence of 'dp_packet's >> instead of multi-segment mbufs, so there should be no need to enable >> support for multi-segment mbufs in DPDK PMD. >> > I agree, thanks! > So which one do we need, for dp_packet: > 1) GSO 2) Tunneled GSO, ex: vxlan and geneve, or 3) Both? > > I feel we can assume all HW nic today support TSO, but not > tunnel packet's TSO. So we only need to implement (2). Well, we need everything. We can't depend on fact that all ports supports even TSO since this is under control of a VM configuration in case of a VM attached via vhost-user port. AF_XDP ports doesn't support TSO, some DPDK vdevs probably doesn't support TSO. 
If OVS is running inside a VM, ports might not support TSO either, since that depends on what the underlying port implementation is and on how the VM was configured.
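To make the suggested fallback concrete: the idea is to split one oversized packet into per-MTU packets entirely in software, before any netdev-specific code runs. Below is a minimal sketch of that idea in plain C; the 'pkt_buf' type and the helper name are hypothetical stand-ins for illustration only, not real OVS or DPDK APIs:

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct pkt_buf {            /* hypothetical contiguous packet buffer */
        uint8_t *data;
        size_t len;
    };

    /* Split 'pkt' into packets carrying at most 'seg_payload' payload
     * bytes each, replicating the first 'hdr_len' header bytes in front
     * of every segment.  Returns the number of output packets written to
     * 'out', or -1 on failure. */
    static int
    gso_sw_fallback(const struct pkt_buf *pkt, size_t hdr_len,
                    size_t seg_payload, struct pkt_buf *out, int max_out)
    {
        size_t payload, off;
        int n = 0;

        if (seg_payload == 0 || pkt->len <= hdr_len) {
            return -1;
        }
        payload = pkt->len - hdr_len;

        for (off = 0; off < payload && n < max_out; off += seg_payload) {
            size_t chunk = payload - off < seg_payload
                           ? payload - off : seg_payload;

            out[n].len = hdr_len + chunk;
            out[n].data = malloc(out[n].len);
            if (!out[n].data) {
                while (n--) {
                    free(out[n].data);   /* undo earlier allocations */
                }
                return -1;
            }
            memcpy(out[n].data, pkt->data, hdr_len);          /* headers */
            memcpy(out[n].data + hdr_len,
                   pkt->data + hdr_len + off, chunk);         /* payload */
            /* A real fallback would now fix up ip->tot_len, the IP ID,
             * tcp->seq, FIN/PSH flags on non-final segments, and
             * recompute the checksums of each output packet. */
            n++;
        }
        return n;
    }

Each output here is an independent flat buffer, the moral equivalent of a separate 'dp_packet', so nothing downstream ever has to understand chained buffers. That is the key difference from the DPDK-only path in the patch below, which delegates the same job to rte_gso_segment() and produces multi-segment mbufs instead.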
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 79895f2..c33868d 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -83,6 +83,8 @@ enum dp_packet_offload_mask {
     DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, PKT_TX_SCTP_CKSUM, 0x800),
     /* VXLAN TCP Segmentation Offload. */
     DEF_OL_FLAG(DP_PACKET_OL_TX_TUNNEL_VXLAN, PKT_TX_TUNNEL_VXLAN, 0x1000),
+    /* UDP Segmentation Offload. */
+    DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_SEG, PKT_TX_UDP_SEG, 0x2000),
     /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
 };
 
@@ -97,7 +99,8 @@ enum dp_packet_offload_mask {
                                      DP_PACKET_OL_TX_IPV6 | \
                                      DP_PACKET_OL_TX_TCP_CKSUM | \
                                      DP_PACKET_OL_TX_UDP_CKSUM | \
-                                     DP_PACKET_OL_TX_SCTP_CKSUM)
+                                     DP_PACKET_OL_TX_SCTP_CKSUM | \
+                                     DP_PACKET_OL_TX_UDP_SEG)
 
 #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
                                  DP_PACKET_OL_TX_UDP_CKSUM | \
@@ -956,6 +959,13 @@ dp_packet_hwol_is_tso(const struct dp_packet *b)
     return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_TCP_SEG);
 }
 
+/* Returns 'true' if packet 'b' is marked for UDP segmentation offloading. */
+static inline bool
+dp_packet_hwol_is_uso(const struct dp_packet *b)
+{
+    return !!(*dp_packet_ol_flags_ptr(b) & DP_PACKET_OL_TX_UDP_SEG);
+}
+
 /* Returns 'true' if packet 'b' is marked for IPv4 checksum offloading. */
 static inline bool
 dp_packet_hwol_is_ipv4(const struct dp_packet *b)
@@ -1034,6 +1044,15 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
     *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
 }
 
+/* Mark packet 'b' for UDP segmentation offloading. It implies that
+ * the packet 'b' is marked for either IPv4 or IPv6 checksum offloading
+ * and also for UDP checksum offloading. */
+static inline void
+dp_packet_hwol_set_udp_seg(struct dp_packet *b)
+{
+    *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_UDP_SEG;
+}
+
 #ifdef DPDK_NETDEV
 /* Mark packet 'b' for VXLAN TCP segmentation offloading. */
 static inline void
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 30493ed..888a45e 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -38,13 +38,15 @@
 #include <rte_errno.h>
 #include <rte_ethdev.h>
 #include <rte_flow.h>
+#include <rte_gso.h>
+#include <rte_ip.h>
 #include <rte_malloc.h>
 #include <rte_mbuf.h>
 #include <rte_meter.h>
 #include <rte_pci.h>
 #include <rte_version.h>
 #include <rte_vhost.h>
-#include <rte_ip.h>
+#include <rte_ip_frag.h>
 
 #include "cmap.h"
 #include "coverage.h"
@@ -162,6 +164,7 @@ typedef uint16_t dpdk_port_t;
                                    | DEV_TX_OFFLOAD_UDP_CKSUM \
                                    | DEV_TX_OFFLOAD_IPV4_CKSUM)
 
+#define MAX_GSO_MBUFS 64
 
 static const struct rte_eth_conf port_conf = {
     .rxmode = {
@@ -2171,6 +2174,16 @@ is_local_to_local(uint16_t src_port_id, struct netdev_dpdk *dev)
     return ret;
 }
 
+static uint16_t
+get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype)
+{
+    if (ethertype == htons(RTE_ETHER_TYPE_IPV4)) {
+        return rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
+    } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
+        return rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
+    }
+}
+
 /* Prepare the packet for HWOL.
  * Return True if the packet is OK to continue. */
 static bool
@@ -2203,10 +2216,9 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
      * also can't be handled. So PKT_TX_TUNNEL_VXLAN must be cleared and
      * outer_l2_len and outer_l3_len must be zeroed.
      */
-    if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)
-        || (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
+    if (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG))
         && (mbuf->pkt_len <= 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
-            + mbuf->l2_len))) {
+            + mbuf->l2_len)) {
         mbuf->ol_flags &= ~PKT_TX_TUNNEL_VXLAN;
         mbuf->l2_len -= sizeof(struct udp_header)
                         + sizeof(struct vxlanhdr);
@@ -2249,7 +2261,7 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
             /* inner IP checksum offload */
             mbuf->ol_flags |= PKT_TX_IP_CKSUM;
         }
-    } else if (mbuf->ol_flags & PKT_TX_L4_MASK) {
+    } else if (mbuf->ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)) {
         /* Handle VLAN TSO */
         /* no inner IP checksum for IPV6 */
         if (mbuf->ol_flags & PKT_TX_IPV4) {
@@ -2273,6 +2285,18 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
         mbuf->outer_l2_len = 0;
         mbuf->outer_l3_len = 0;
+
+        /* In the case of GRO, PKT_TX_TCP_SEG or PKT_TX_UDP_SEG wasn't set
+         * by the GRO APIs, so this is the place we can mark it.
+         */
+        if ((mbuf->pkt_len > 1464)
+            && (!(mbuf->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) {
+            if (l4_proto == IPPROTO_UDP) {
+                mbuf->ol_flags |= PKT_TX_UDP_SEG;
+            } else if (l4_proto == IPPROTO_TCP) {
+                mbuf->ol_flags |= PKT_TX_TCP_SEG;
+            }
+        }
     }
 
     /* It is possible that l4_len isn't set for vhostuserclient */
@@ -2284,6 +2308,10 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf)
         mbuf->l4_len = (tcp_hdr->data_off & 0xf0) >> 2;
     }
 
+    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
+        return true;
+    }
+
     if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
         if (l4_proto != IPPROTO_UDP) {
             VLOG_WARN_RL(&rl, "%s: UDP packet without L4 header"
@@ -2294,11 +2322,13 @@
                mbuf->ol_flags & PKT_TX_TCP_CKSUM) {
         if (l4_proto != IPPROTO_TCP) {
             VLOG_WARN_RL(&rl, "%s: TCP Segmentation without L4 header"
-                         " pkt len: %"PRIu32"", dev->up.name, mbuf->pkt_len);
+                         " pkt len: %"PRIu32" l4_proto = %d",
+                         dev->up.name, mbuf->pkt_len, l4_proto);
             return false;
         }
 
-        if (mbuf->pkt_len - mbuf->l2_len > 1450) {
+        if (mbuf->pkt_len > 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
+                            + mbuf->l2_len) {
             dp_packet_hwol_set_tcp_seg(pkt);
         }
 
@@ -2308,7 +2338,66 @@
         } else {
             mbuf->tso_segsz = 0;
         }
+
+        if (!(dev->up.ol_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
+            /* PKT_TX_TCP_CKSUM must be cleaned for GSO because the
+             * TCP checksum can only be calculated by software in the
+             * GSO case.
+             */
+            mbuf->ol_flags &= ~PKT_TX_TCP_CKSUM;
+        }
     }
+
+    /* UDP GSO if necessary */
+    if (l4_proto == IPPROTO_UDP) {
+        /* VXLAN GSO can be done here */
+        if ((mbuf->ol_flags & PKT_TX_UDP_SEG) ||
+            (mbuf->pkt_len > (1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
+                              + mbuf->l2_len))) {
+            dp_packet_hwol_set_udp_seg(pkt);
+
+            /* For UDP GSO, the UDP checksum must be calculated by software */
+            if ((mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM) {
+                void *l3_hdr, *l4_hdr;
+                struct rte_udp_hdr *udp_hdr;
+
+                /* PKT_TX_UDP_CKSUM must be cleaned for GSO because the
+                 * UDP checksum can only be calculated by software in the
+                 * GSO case.
+                 */
+                mbuf->ol_flags &= ~PKT_TX_UDP_CKSUM;
+
+                eth_hdr = (struct rte_ether_hdr *)
+                    ((uint8_t *)eth_hdr + mbuf->outer_l2_len
+                     + mbuf->outer_l3_len
+                     + sizeof(struct udp_header)
+                     + sizeof(struct vxlanhdr));
+                l3_hdr = (uint8_t *)eth_hdr + mbuf->l2_len -
+                         sizeof(struct udp_header) -
+                         sizeof(struct vxlanhdr);
+                l4_hdr = (uint8_t *)l3_hdr + mbuf->l3_len;
+                ip_hdr = (struct rte_ipv4_hdr *)l3_hdr;
+                ip_hdr->hdr_checksum = 0;
+                ip_hdr->hdr_checksum = rte_ipv4_cksum(ip_hdr);
+                /* Don't touch the UDP checksum if it is an IP fragment */
+                if (!rte_ipv4_frag_pkt_is_fragmented(ip_hdr)) {
+                    udp_hdr = (struct rte_udp_hdr *)l4_hdr;
+                    udp_hdr->dgram_cksum = 0;
+                    udp_hdr->dgram_cksum =
+                        get_udptcp_checksum(l3_hdr, l4_hdr,
+                                            eth_hdr->ether_type);
+                }
+            }
+
+            /* For GSO, gso_size includes l2_len + l3_len */
+            mbuf->tso_segsz = 1450 + mbuf->outer_l2_len + mbuf->outer_l3_len
+                              + mbuf->l2_len;
+            if (mbuf->tso_segsz > dev->mtu) {
+                mbuf->tso_segsz = dev->mtu;
+            }
+        }
+    }
+
     return true;
 }
 
@@ -2339,24 +2428,19 @@ netdev_dpdk_prep_hwol_batch(struct netdev_dpdk *dev, struct rte_mbuf **pkts,
     return cnt;
 }
 
-/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
- * 'pkts', even in case of failure.
- *
- * Returns the number of packets that weren't transmitted. */
 static inline int
-netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
-                         struct rte_mbuf **pkts, int cnt)
+__netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
+                           struct rte_mbuf **pkts, int cnt)
 {
     uint32_t nb_tx = 0;
-    uint16_t nb_tx_prep = cnt;
+    uint32_t nb_tx_prep;
 
-    if (userspace_tso_enabled()) {
-        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
-        if (nb_tx_prep != cnt) {
-            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
-                         "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,
-                         cnt, rte_strerror(rte_errno));
-        }
+    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
+    if (nb_tx_prep != cnt) {
+        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
+                     "Only %u/%u are valid: %s",
+                     dev->up.name, nb_tx_prep,
+                     cnt, rte_strerror(rte_errno));
     }
 
     while (nb_tx != nb_tx_prep) {
@@ -2384,6 +2468,200 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
     return cnt - nb_tx;
 }
 
+static inline void
+set_multiseg_udptcp_cksum(struct rte_mbuf *mbuf)
+{
+    uint16_t l3_offset = mbuf->outer_l2_len + mbuf->outer_l3_len
+                         + mbuf->l2_len;
+    struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *)
+        (rte_pktmbuf_mtod(mbuf, char *) + l3_offset);
+    struct rte_tcp_hdr *tcp_hdr;
+    uint32_t l4_hdr_len;
+    uint8_t *l4_hdr;
+    struct rte_mbuf *next = mbuf->next;
+    uint32_t cksum = 0;
+    uint16_t l4_proto;
+    uint32_t inner_cksum;
+
+    l4_proto = ipv4_hdr->next_proto_id;
+    if ((l4_proto != IPPROTO_UDP) && (l4_proto != IPPROTO_TCP)) {
+        return;
+    }
+
+    if (l4_proto == IPPROTO_TCP) {
+        /* For TCP GSO, the inner TCP header is in every segment, and the
+         * TCP checksum has to be calculated by software.
+         */
+        l4_hdr_len = mbuf->data_len - l3_offset
+                     - sizeof(struct rte_ipv4_hdr);
+        l4_hdr = (uint8_t *)(ipv4_hdr + 1);
+        tcp_hdr = (struct rte_tcp_hdr *)l4_hdr;
+        tcp_hdr->cksum = 0;
+    }
+
+    /* Set inner IP checksum */
+    ipv4_hdr->hdr_checksum = 0;
+    ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
+
+    if (l4_proto == IPPROTO_TCP) {
+        cksum = rte_raw_cksum(l4_hdr, l4_hdr_len);
+    } else if (l4_proto == IPPROTO_UDP) {
+        if (next == NULL) {
+            /* It wasn't GSOed */
+            cksum = rte_raw_cksum(ipv4_hdr + 1,
+                                  ntohs(ipv4_hdr->total_length)
+                                  - sizeof(struct rte_ipv4_hdr));
+        } else {
+            cksum = 0;
+        }
+    }
+
+    /* It was GSOed */
+    while (next) {
+        cksum += rte_raw_cksum(rte_pktmbuf_mtod(next, char *),
+                               next->data_len);
+        next = next->next;
+    }
+
+    /* Save cksum to inner_cksum, the outer UDP checksum needs it */
+    inner_cksum = cksum;
+
+    cksum += rte_ipv4_phdr_cksum(ipv4_hdr, 0);
+    cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
+    cksum = (~cksum) & 0xffff;
+    if (cksum == 0) {
+        cksum = 0xffff;
+    }
+
+    /* Set inner TCP checksum */
+    if (l4_proto == IPPROTO_TCP) {
+        tcp_hdr->cksum = (uint16_t)cksum;
+    }
+
+    /* Set outer UDP checksum in case of VXLAN */
+    if (mbuf->outer_l2_len != 0) {
+        ipv4_hdr = (struct rte_ipv4_hdr *)
+            (rte_pktmbuf_mtod(mbuf, char *) + mbuf->outer_l2_len);
+        struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *)
+            (ipv4_hdr + 1);
+
+        /* Set outer IP checksum */
+        ipv4_hdr->hdr_checksum = 0;
+        ipv4_hdr->hdr_checksum = rte_ipv4_cksum(ipv4_hdr);
+
+        udp_hdr->dgram_cksum = 0;
+        cksum = rte_ipv4_phdr_cksum(ipv4_hdr, 0);
+        cksum += rte_raw_cksum(udp_hdr, mbuf->l2_len + mbuf->l3_len);
+        cksum += inner_cksum;
+        if (l4_proto == IPPROTO_TCP) {
+            cksum += tcp_hdr->cksum;
+        }
+        cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
+        cksum = (~cksum) & 0xffff;
+        if (cksum == 0) {
+            cksum = 0xffff;
+        }
+        udp_hdr->dgram_cksum = (uint16_t)cksum;
+    }
+}
+
+/* Tries to transmit 'pkts' to txq 'qid' of device 'dev'. Takes ownership of
+ * 'pkts', even in case of failure.
+ *
+ * Returns the number of packets that weren't transmitted. */
+static inline int
+netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int qid,
+                         struct rte_mbuf **pkts, int cnt)
+{
+    uint32_t nb_tx = 0;
+    int i;
+    int ret;
+    int failures = 0;
+
+    if (userspace_tso_enabled()) {
+        /* The best point to do GSO */
+        struct rte_gso_ctx gso_ctx;
+        struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+        int tx_start = -1;
+
+        /* Set up the GSO context */
+        gso_ctx.direct_pool = dev->dpdk_mp->mp;
+        gso_ctx.indirect_pool = dev->dpdk_mp->mp;
+
+        /* Do GSO if needed */
+        for (i = 0; i < cnt; i++) {
+            if (((pkts[i]->ol_flags & PKT_TX_UDP_SEG) &&
+                 !(dev->hw_ol_features & DEV_TX_OFFLOAD_UDP_TSO)) ||
+                ((pkts[i]->ol_flags & PKT_TX_TCP_SEG) &&
+                 ((!(dev->hw_ol_features & NETDEV_TX_VXLAN_TNL_TSO_OFFLOAD)
+                   && (pkts[i]->ol_flags & PKT_TX_TUNNEL_VXLAN))
+                  || !(dev->hw_ol_features & DEV_TX_OFFLOAD_TCP_TSO)))) {
+                /* Send non-GSO packets before pkts[i] */
+                if (tx_start != -1) {
+                    failures += __netdev_dpdk_eth_tx_burst(
+                        dev, qid,
+                        pkts + tx_start,
+                        i - tx_start);
+                }
+                tx_start = -1;
+
+                gso_ctx.gso_types = 0;
+                gso_ctx.gso_size = pkts[i]->tso_segsz;
+                gso_ctx.flag = 0;
+                if (pkts[i]->ol_flags & PKT_TX_TUNNEL_VXLAN) {
+                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_VXLAN_TNL_TSO;
+                }
+                if (pkts[i]->ol_flags & PKT_TX_UDP_SEG) {
+                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_UDP_TSO;
+                } else if (pkts[i]->ol_flags & PKT_TX_TCP_SEG) {
+                    gso_ctx.gso_types |= DEV_TX_OFFLOAD_TCP_TSO;
+                    pkts[i]->ol_flags &= ~PKT_TX_TCP_CKSUM;
+                }
+                ret = rte_gso_segment(pkts[i],   /* packet to segment */
+                                      &gso_ctx,  /* gso context */
+                                      /* gso output mbufs */
+                                      (struct rte_mbuf **)&gso_mbufs,
+                                      MAX_GSO_MBUFS);
+                if (ret < 0) {
+                    rte_pktmbuf_free(pkts[i]);
+                } else {
+                    int j, k;
+                    struct rte_mbuf *next_part;
+                    nb_tx = ret;
+                    for (j = 0; j < nb_tx; j++) {
+                        set_multiseg_udptcp_cksum(gso_mbufs[j]);
+                        /* Clear them because of no offload */
+                        gso_mbufs[j]->ol_flags = 0;
+                        gso_mbufs[j]->outer_l2_len = 0;
+                        gso_mbufs[j]->outer_l3_len = 0;
+                        gso_mbufs[j]->l2_len = 0;
+                        gso_mbufs[j]->l3_len = 0;
+                        gso_mbufs[j]->l4_len = 0;
+                        next_part = gso_mbufs[j];
+                        for (k = 0; k < gso_mbufs[j]->nb_segs; k++) {
+                            next_part = next_part->next;
+                        }
+                    }
+                    __netdev_dpdk_eth_tx_burst(dev, qid, gso_mbufs, nb_tx);
+                }
+                continue;
+            }
+            if (tx_start == -1) {
+                tx_start = i;
+            }
+        }
+
+        if (tx_start != -1) {
+            /* Send non-GSO packets before pkts[i] */
+            failures += __netdev_dpdk_eth_tx_burst(dev, qid, pkts + tx_start,
+                                                   i - tx_start);
+        }
+        return failures;
+    }
+
+    return __netdev_dpdk_eth_tx_burst(dev, qid, pkts, cnt);
+}
+
 static inline bool
 netdev_dpdk_srtcm_policer_pkt_handle(struct rte_meter_srtcm *meter,
                                      struct rte_meter_srtcm_profile *profile,
@@ -2786,10 +3064,24 @@ out:
     }
 }
 
+struct shinfo_arg {
+    void *buf;
+    struct rte_mbuf *mbuf;
+};
+
+/* For the GSO case, the extended mbuf can only be freed by
+ * netdev_dpdk_extbuf_free.
+ */
 static void
-netdev_dpdk_extbuf_free(void *addr OVS_UNUSED, void *opaque)
+netdev_dpdk_extbuf_free(struct rte_mbuf *m, void *opaque)
 {
-    rte_free(opaque);
+    struct shinfo_arg *arg = (struct shinfo_arg *)opaque;
+
+    rte_free(arg->buf);
+    if (m != arg->mbuf) {
+        rte_pktmbuf_free(arg->mbuf);
+    }
+    free(arg);
 }
 
 static struct rte_mbuf *
@@ -2821,8 +3113,11 @@ dpdk_pktmbuf_attach_extbuf(struct rte_mbuf *pkt, uint32_t data_len)
 
     /* Initialize shinfo. */
     if (shinfo) {
+        struct shinfo_arg *arg = xmalloc(sizeof(struct shinfo_arg));
+        arg->buf = buf;
+        arg->mbuf = pkt;
         shinfo->free_cb = netdev_dpdk_extbuf_free;
-        shinfo->fcb_opaque = buf;
+        shinfo->fcb_opaque = arg;
         rte_mbuf_ext_refcnt_set(shinfo, 1);
     } else {
         shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf, &buf_len,
@@ -2852,6 +3147,10 @@ dpdk_pktmbuf_alloc(struct rte_mempool *mp, uint32_t data_len)
         return NULL;
     }
 
+    if (unlikely(pkt->shinfo != NULL)) {
+        pkt->shinfo = NULL;
+    }
+
     if (rte_pktmbuf_tailroom(pkt) >= data_len) {
         return pkt;
     }
@@ -5192,6 +5491,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
     int err;
     uint64_t vhost_flags = 0;
    uint64_t vhost_unsup_flags;
+    uint64_t vhost_supported_flags;
     bool zc_enabled;
 
     ovs_mutex_lock(&dev->mutex);
@@ -5277,6 +5577,16 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
         goto unlock;
     }
 
+    err = rte_vhost_driver_get_features(dev->vhost_id,
+                                        &vhost_supported_flags);
+    if (err) {
+        VLOG_ERR("rte_vhost_driver_get_features failed for "
+                 "vhost user client port: %s\n", dev->up.name);
+        goto unlock;
+    }
+    VLOG_INFO("vhostuserclient port %s features: 0x%016lx",
+              dev->up.name, vhost_supported_flags);
+
     err = rte_vhost_driver_start(dev->vhost_id);
     if (err) {
         VLOG_ERR("rte_vhost_driver_start failed for vhost user "
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index 9f830b4..557f139 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -6566,6 +6566,16 @@ netdev_linux_parse_l2(struct dp_packet *b, uint16_t *l4proto)
         l4_len = TCP_OFFSET(tcp_hdr->tcp_ctl) * 4;
         dp_packet_hwol_set_l4_len(b, l4_len);
+    } else if (*l4proto == IPPROTO_UDP) {
+        struct udp_header *udp_hdr = dp_packet_at(b, l2_len + l3_len,
+                                                  sizeof(struct udp_header));
+
+        if (!udp_hdr) {
+            return -EINVAL;
+        }
+
+        l4_len = sizeof(struct udp_header);
+        dp_packet_hwol_set_l4_len(b, l4_len);
     }
 
     return 0;
@@ -6581,10 +6591,6 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
         return -EINVAL;
     }
 
-    if (vnet->flags == 0 && vnet->gso_type == VIRTIO_NET_HDR_GSO_NONE) {
-        return 0;
-    }
-
     if (netdev_linux_parse_l2(b, &l4proto)) {
         return -EINVAL;
     }
@@ -6609,6 +6615,9 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b)
             || type == VIRTIO_NET_HDR_GSO_TCPV6) {
             dp_packet_hwol_set_tcp_seg(b);
         }
+        if (type == VIRTIO_NET_HDR_GSO_UDP) {
+            dp_packet_hwol_set_udp_seg(b);
+        }
     }
 
     return 0;
diff --git a/lib/netdev.c b/lib/netdev.c
index 64583d1..02f28c8 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -825,23 +825,41 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
 {
     uint64_t l4_mask;
 
-    if (dp_packet_hwol_is_vxlan_tcp_seg(packet)
-        && (dp_packet_hwol_is_tso(packet) || dp_packet_hwol_l4_mask(packet))
-        && !(netdev_flags & NETDEV_TX_OFFLOAD_VXLAN_TSO)) {
-        /* Fall back to GSO in software. */
-        VLOG_ERR_BUF(errormsg, "No VXLAN TSO support");
-        return false;
-    }
-
-    if (dp_packet_hwol_is_tso(packet)
-        && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) {
-        /* Fall back to GSO in software. */
-        VLOG_ERR_BUF(errormsg, "No TSO support");
-        return false;
-    }
-
+    /* GSO can handle TSO in software even if the device can't do the
+     * hardware offload, so there is no need to check it here.
+     */
     l4_mask = dp_packet_hwol_l4_mask(packet);
     if (l4_mask) {
+        /* Calculate the checksum for the VLAN TSO case when no hardware
+         * offload feature is available. Note: for the VXLAN TSO case, the
+         * checksum has already been calculated before this point, so it
+         * won't be done here again, because the checksum flags in
+         * packet->m.ol_flags have been cleaned.
+         */
+        if (dp_packet_hwol_l4_is_tcp(packet)
+            && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
+            && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
+            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_TCP_CKSUM;
+            /* Only calculate the TCP checksum for non-TSO packets;
+             * it will be calculated after GSO for TSO packets.
+             */
+            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_TCP_SEG)) {
+                calculate_tcpudp_checksum(packet);
+            }
+            return true;
+        } else if (dp_packet_hwol_l4_is_udp(packet)
+                   && !dp_packet_hwol_is_vxlan_tcp_seg(packet)
+                   && !(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) {
+            packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
+            /* Only calculate the UDP checksum for non-UFO packets;
+             * it will be calculated immediately before GSO for
+             * UFO packets.
+             */
+            if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_UDP_SEG)) {
+                calculate_tcpudp_checksum(packet);
+            }
+            return true;
+        }
+
         if (dp_packet_hwol_l4_is_tcp(packet)) {
             if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) {
                 /* Fall back to TCP csum in software. */
@@ -1013,11 +1031,26 @@ netdev_push_header(const struct netdev *netdev,
             /* VXLAN offload can't support udp checksum offload
              * for inner udp packet, so udp checksum must be set
              * before push header in order that outer checksum can
-             * be set correctly.
+             * be set correctly. But the GSO code will set the udp checksum
+             * if packet->mbuf.ol_flags has DP_PACKET_OL_TX_UDP_SEG.
              */
             if (dp_packet_hwol_l4_is_udp(packet)) {
                 packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_UDP_CKSUM;
-                calculate_tcpudp_checksum(packet);
+                /* Only calculate the UDP checksum for non-UFO packets;
+                 * it will be calculated immediately before GSO for
+                 * UFO packets.
+                 */
+                if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_UDP_SEG)) {
+                    calculate_tcpudp_checksum(packet);
+                }
+            } else if (dp_packet_hwol_l4_is_tcp(packet)) {
+                packet->mbuf.ol_flags &= ~DP_PACKET_OL_TX_TCP_CKSUM;
+                /* Only calculate the TCP checksum for non-TSO packets;
+                 * it will be calculated after GSO for TSO packets.
+                 */
+                if (!(packet->mbuf.ol_flags & DP_PACKET_OL_TX_TCP_SEG)) {
+                    calculate_tcpudp_checksum(packet);
+                }
             }
         }
     netdev->netdev_class->push_header(netdev, packet, data);
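A note on the checksum arithmetic in set_multiseg_udptcp_cksum() in the patch above: the per-segment loop works because the Internet checksum is a ones'-complement sum that can be accumulated buffer by buffer (that is what rte_raw_cksum() returns per buffer) and folded once at the end. A standalone illustration in plain C follows; the 'seg' struct is a made-up stand-in for a chained mbuf, and, like the patch's loop, this assumes every non-final segment has an even length, since an odd-length middle segment would throw off the 16-bit pairing:

    #include <stddef.h>
    #include <stdint.h>

    struct seg {                /* made-up stand-in for a chained mbuf */
        const uint8_t *data;
        size_t len;
        const struct seg *next;
    };

    /* Accumulate an unfolded 16-bit ones'-complement sum over one buffer,
     * so that partial results from several buffers can simply be added. */
    static uint32_t
    raw_sum(const uint8_t *buf, size_t len, uint32_t sum)
    {
        size_t i;

        for (i = 0; i + 1 < len; i += 2) {
            sum += ((uint32_t) buf[i] << 8) | buf[i + 1];
        }
        if (len & 1) {
            sum += (uint32_t) buf[len - 1] << 8;  /* pad odd byte with 0 */
        }
        return sum;
    }

    /* Fold the per-segment sums into one Internet checksum. */
    static uint16_t
    chained_cksum(const struct seg *s)
    {
        uint32_t sum = 0;

        for (; s; s = s->next) {
            sum = raw_sum(s->data, s->len, sum);
        }
        while (sum >> 16) {              /* fold carries back in */
            sum = (sum & 0xffff) + (sum >> 16);
        }
        return (uint16_t) ~sum;
    }

This additivity is also why the patch can add the pseudo-header sum from rte_ipv4_phdr_cksum() to the payload sum afterwards: the order of the additions doesn't matter until the final fold and complement.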