Message ID | 20200807105648.94860-4-yang_y_yi@163.com |
---|---|
State | Changes Requested |
Headers | show |
Series | userspace: enable VXLAN TSO, GSO and GRO | expand |
On 8/7/20 12:56 PM, yang_y_yi@163.com wrote: > From: Yi Yang <yangyi01@inspur.com> > > GRO(Generic Receive Offload) can help improve performance > when TSO (TCP Segment Offload) or VXLAN TSO is enabled > on transmit side, this can avoid overhead of ovs DPDK > data path and enqueue vhost for VM by merging many small > packets to large packets (65535 bytes at most) once it > receives packets from physical NIC. IIUC, this patch allows multi-segment mbufs to float across different parts of OVS. This will definitely crash it somewhere. Much more changes all over the OVS required to make it safely work with such mbufs. There were few attempts to introduce this support, but all of them ended up being rejected. As it is this patch is not acceptable as it doesn't cover almost anything beside simple cases inside netdev implementation. Here is the latest attempt with multi-segment mbufs: https://patchwork.ozlabs.org/project/openvswitch/list/?series=130193&state=* Best regards, Ilya Maximets. > > It can work for both VXLAN and vlan case. > > Signed-off-by: Yi Yang <yangyi01@inspur.com> > --- > lib/dp-packet.h | 37 ++++++++- > lib/netdev-dpdk.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++++- > lib/netdev-linux.c | 112 ++++++++++++++++++++++++-- > 3 files changed, 365 insertions(+), 11 deletions(-) > > diff --git a/lib/dp-packet.h b/lib/dp-packet.h > index c33868d..18307c0 100644 > --- a/lib/dp-packet.h > +++ b/lib/dp-packet.h > @@ -580,7 +580,16 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v) > * (and thus 'v') will always be <= UINT16_MAX; this means that there is no > * loss of accuracy in assigning 'v' to 'data_len'. > */ > - b->mbuf.data_len = (uint16_t)v; /* Current seg length. */ > + if (b->mbuf.nb_segs <= 1) { > + b->mbuf.data_len = (uint16_t)v; /* Current seg length. */ > + } else { > + /* For multi-seg packet, if it is resize, data_len should be > + * adjusted by offset, this will happend in case of push or pop. > + */ > + if (b->mbuf.pkt_len != 0) { > + b->mbuf.data_len += v - b->mbuf.pkt_len; > + } > + } > b->mbuf.pkt_len = v; /* Total length of all segments linked to > * this segment. */ > } > @@ -1092,6 +1101,20 @@ dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len) > { > b->mbuf.l4_len = l4_len; > } > + > +/* Set outer_l2_len for the packet 'b' */ > +static inline void > +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int outer_l2_len) > +{ > + b->mbuf.outer_l2_len = outer_l2_len; > +} > + > +/* Set outer_l3_len for the packet 'b' */ > +static inline void > +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int outer_l3_len) > +{ > + b->mbuf.outer_l3_len = outer_l3_len; > +} > #else > /* Mark packet 'b' for VXLAN TCP segmentation offloading. */ > static inline void > @@ -1125,6 +1148,18 @@ dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED, > int l4_len OVS_UNUSED) > { > } > + > +/* Set outer_l2_len for the packet 'b' */ > +static inline void > +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int outer_l2_len) > +{ > +} > + > +/* Set outer_l3_len for the packet 'b' */ > +static inline void > +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int outer_l3_len) > +{ > +} > #endif /* DPDK_NETDEV */ > > static inline bool > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c > index 888a45e..b6c57a6 100644 > --- a/lib/netdev-dpdk.c > +++ b/lib/netdev-dpdk.c > @@ -25,6 +25,7 @@ > #include <linux/virtio_net.h> > #include <sys/socket.h> > #include <linux/if.h> > +#include <linux/unistd.h> > > /* Include rte_compat.h first to allow experimental API's needed for the > * rte_meter.h rfc4115 functions. Once they are no longer marked as > @@ -47,6 +48,7 @@ > #include <rte_version.h> > #include <rte_vhost.h> > #include <rte_ip_frag.h> > +#include <rte_gro.h> > > #include "cmap.h" > #include "coverage.h" > @@ -2184,6 +2186,8 @@ get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype) > } > } > > +#define UDP_VXLAN_ETH_HDR_SIZE 30 > + > /* Prepare the packet for HWOL. > * Return True if the packet is OK to continue. */ > static bool > @@ -2207,6 +2211,42 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) > return true; > } > > + /* ol_flags is cleaned after vxlan pop, so need reset for those packets. > + * Such packets are only for local VMs or namespaces, so need to return > + * after ol_flags, l2_len, l3_len and tso_segsz are set. > + */ > + if (((mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) == 0) && > + (mbuf->l2_len == UDP_VXLAN_ETH_HDR_SIZE) && > + (mbuf->pkt_len > 1464)) { > + mbuf->ol_flags = 0; > + mbuf->l2_len -= sizeof(struct udp_header) > + + sizeof(struct vxlanhdr); > + if (mbuf->l3_len == IP_HEADER_LEN) { > + mbuf->ol_flags |= PKT_TX_IPV4; > + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1); > + l4_proto = ip_hdr->next_proto_id; > + } else if (mbuf->l3_len == IPV6_HEADER_LEN) { > + mbuf->ol_flags |= PKT_TX_IPV6; > + ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1); > + l4_proto = ip6_hdr->proto; > + } > + > + mbuf->ol_flags |= PKT_TX_IP_CKSUM; > + if (l4_proto == IPPROTO_TCP) { > + mbuf->ol_flags |= PKT_TX_TCP_SEG; > + mbuf->ol_flags |= PKT_TX_TCP_CKSUM; > + } else if (l4_proto == IPPROTO_UDP) { > + mbuf->ol_flags |= PKT_TX_UDP_SEG; > + mbuf->ol_flags |= PKT_TX_UDP_CKSUM; > + } > + mbuf->tso_segsz = 1450; > + if (mbuf->tso_segsz > dev->mtu) { > + mbuf->tso_segsz = dev->mtu; > + } > + > + return true; > + } > + > if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) { > /* Handle VXLAN TSO */ > struct rte_udp_hdr *udp_hdr; > @@ -2853,6 +2893,104 @@ netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq) > return dev->vhost_rxq_enabled[rxq->queue_id]; > } > > +#define VXLAN_DST_PORT 4789 > + > +static void > +netdev_dpdk_parse_hdr(struct dp_packet *pkt, int offset, uint16_t *l4_proto, > + int *is_frag) > +{ > + struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt; > + struct rte_ether_hdr *eth_hdr = > + rte_pktmbuf_mtod_offset(mbuf, struct rte_ether_hdr *, offset); > + ovs_be16 eth_type; > + int l2_len; > + int l3_len = 0; > + int l4_len = 0; > + uint16_t inner_l4_proto = 0; > + int inner_is_frag = 0; > + > + if (offset == 0) { > + *is_frag = 0; > + } > + mbuf->packet_type = 0; > + l2_len = ETH_HEADER_LEN; > + eth_type = (OVS_FORCE ovs_be16) eth_hdr->ether_type; > + if (eth_type_vlan(eth_type)) { > + struct rte_vlan_hdr *vlan_hdr = > + (struct rte_vlan_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET); > + > + eth_type = (OVS_FORCE ovs_be16) vlan_hdr->eth_proto; > + l2_len += VLAN_HEADER_LEN; > + } > + > + dp_packet_hwol_set_l2_len(pkt, l2_len); > + dp_packet_hwol_set_outer_l2_len(pkt, 0); > + dp_packet_hwol_set_outer_l3_len(pkt, 0); > + > + if (eth_type == htons(ETH_TYPE_IP)) { > + struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *) > + ((char *)eth_hdr + l2_len); > + > + l3_len = IP_HEADER_LEN; > + dp_packet_hwol_set_tx_ipv4(pkt); > + *l4_proto = ipv4_hdr->next_proto_id; > + *is_frag = rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr); > + mbuf->packet_type |= RTE_PTYPE_L3_IPV4; > + } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) { > + struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *) > + ((char *)eth_hdr + l2_len); > + l3_len = IPV6_HEADER_LEN; > + dp_packet_hwol_set_tx_ipv6(pkt); > + *l4_proto = ipv6_hdr->proto; > + } > + > + dp_packet_hwol_set_l3_len(pkt, l3_len); > + > + if (*l4_proto == IPPROTO_TCP) { > + struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *) > + ((char *)eth_hdr + l2_len + l3_len); > + > + l4_len = (tcp_hdr->data_off & 0xf0) >> 2; > + dp_packet_hwol_set_l4_len(pkt, l4_len); > + mbuf->packet_type |= RTE_PTYPE_L4_TCP; > + } else if (*l4_proto == IPPROTO_UDP) { > + struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *) > + ((char *)eth_hdr + l2_len + l3_len); > + > + l4_len = sizeof(*udp_hdr); > + dp_packet_hwol_set_l4_len(pkt, l4_len); > + mbuf->packet_type |= RTE_PTYPE_L4_UDP; > + > + /* Need to parse inner packet if needed */ > + if (ntohs(udp_hdr->dst_port) == VXLAN_DST_PORT) { > + netdev_dpdk_parse_hdr(pkt, > + l2_len + l3_len + l4_len > + + sizeof(struct vxlanhdr), > + &inner_l4_proto, > + &inner_is_frag); > + mbuf->l2_len += sizeof(struct rte_udp_hdr) > + + sizeof(struct vxlanhdr); > + dp_packet_hwol_set_outer_l2_len(pkt, l2_len); > + dp_packet_hwol_set_outer_l3_len(pkt, l3_len); > + > + /* Set packet_type, it is necessary for GRO */ > + mbuf->packet_type |= RTE_PTYPE_TUNNEL_VXLAN; > + if (mbuf->l3_len == IP_HEADER_LEN) { > + mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4; > + } > + if (inner_l4_proto == IPPROTO_TCP) { > + mbuf->packet_type |= RTE_PTYPE_INNER_L4_TCP; > + mbuf->packet_type |= RTE_PTYPE_L4_UDP; > + } else if (inner_l4_proto == IPPROTO_UDP) { > + mbuf->packet_type |= RTE_PTYPE_INNER_L4_UDP; > + mbuf->packet_type |= RTE_PTYPE_L4_UDP; > + } > + } > + } > +} > + > +static RTE_DEFINE_PER_LCORE(void *, _gro_ctx); > + > static int > netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, > int *qfill) > @@ -2862,6 +3000,36 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, > struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev); > int nb_rx; > int dropped = 0; > + struct rte_gro_param gro_param; > + struct dp_packet *packet; > + struct dp_packet *udp_pkts[NETDEV_MAX_BURST]; > + struct dp_packet *other_pkts[NETDEV_MAX_BURST]; > + > + int nb_udp_rx = 0; > + int nb_other_rx = 0; > + > + /* Initialize GRO parameters */ > + gro_param.gro_types = RTE_GRO_TCP_IPV4 | > + RTE_GRO_UDP_IPV4 | > + RTE_GRO_IPV4_VXLAN_TCP_IPV4 | > + RTE_GRO_IPV4_VXLAN_UDP_IPV4; > + gro_param.max_flow_num = 1024; > + /* There are 46 fragments for a 64K big packet */ > + gro_param.max_item_per_flow = NETDEV_MAX_BURST * 2; > + > + /* Initialize GRO context */ > + if (RTE_PER_LCORE(_gro_ctx) == NULL) { > + uint32_t cpu, node; > + int ret; > + > + ret = syscall(__NR_getcpu, &cpu, &node, NULL); > + if (ret == 0) { > + gro_param.socket_id = node; > + } else { > + gro_param.socket_id = 0; > + } > + RTE_PER_LCORE(_gro_ctx) = rte_gro_ctx_create(&gro_param); > + } > > if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) { > return EAGAIN; > @@ -2890,7 +3058,58 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, > rte_spinlock_unlock(&dev->stats_lock); > } > > + /* Need to parse packet header and set necessary fields in mbuf for GRO */ > batch->count = nb_rx; > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + uint16_t l4_proto = 0; > + int is_frag = 0; > + > + netdev_dpdk_parse_hdr(packet, 0, &l4_proto, &is_frag); > + if (packet->mbuf.packet_type & RTE_PTYPE_TUNNEL_VXLAN) { > + if (packet->mbuf.packet_type & RTE_PTYPE_INNER_L4_UDP) { > + udp_pkts[nb_udp_rx++] = packet; > + } else { > + other_pkts[nb_other_rx++] = packet; > + } > + } else { > + if (packet->mbuf.packet_type & RTE_PTYPE_L4_UDP) { > + udp_pkts[nb_udp_rx++] = packet; > + } else { > + other_pkts[nb_other_rx++] = packet; > + } > + } > + } > + > + /* Do GRO here if needed, note: IP fragment can be out of order */ > + if (nb_udp_rx) { > + /* UDP packet must use heavy rte_gro_reassemble */ > + nb_udp_rx = rte_gro_reassemble((struct rte_mbuf **) udp_pkts, > + nb_udp_rx, RTE_PER_LCORE(_gro_ctx)); > + nb_udp_rx += rte_gro_timeout_flush(RTE_PER_LCORE(_gro_ctx), 10000, > + RTE_GRO_UDP_IPV4 > + | RTE_GRO_IPV4_VXLAN_UDP_IPV4, > + (struct rte_mbuf **)&udp_pkts[nb_udp_rx], > + NETDEV_MAX_BURST - nb_udp_rx); > + } > + > + if (nb_other_rx) { > + /* TCP packet is better for lightweigh rte_gro_reassemble_burst */ > + nb_other_rx = rte_gro_reassemble_burst((struct rte_mbuf **) other_pkts, > + nb_other_rx, > + &gro_param); > + } > + > + batch->count = nb_udp_rx + nb_other_rx; > + if (nb_udp_rx) { > + memcpy(batch->packets, udp_pkts, > + nb_udp_rx * sizeof(struct dp_packet *)); > + } > + > + if (nb_other_rx) { > + memcpy(&batch->packets[nb_udp_rx], other_pkts, > + nb_other_rx * sizeof(struct dp_packet *)); > + } > + > dp_packet_batch_init_packet_fields(batch); > > if (qfill) { > @@ -2931,10 +3150,11 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, > for (i = 0; i < pkt_cnt; i++) { > pkt = pkts[i]; > if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len) > - && !(pkt->ol_flags & PKT_TX_TCP_SEG))) { > + && !(pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) { > VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " " > - "max_packet_len %d", dev->up.name, pkt->pkt_len, > - dev->max_packet_len); > + "max_packet_len %d ol_flags 0x%016lx", > + dev->up.name, pkt->pkt_len, > + dev->max_packet_len, pkt->ol_flags); > rte_pktmbuf_free(pkt); > continue; > } > @@ -3289,7 +3509,6 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, > struct dp_packet_batch *batch, > bool concurrent_txq OVS_UNUSED) > { > - > if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) { > dpdk_do_tx_copy(netdev, qid, batch); > dp_packet_delete_batch(batch, true); > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c > index 557f139..d8a035a 100644 > --- a/lib/netdev-linux.c > +++ b/lib/netdev-linux.c > @@ -50,6 +50,7 @@ > #include <unistd.h> > > #include "coverage.h" > +#include "csum.h" > #include "dp-packet.h" > #include "dpif-netlink.h" > #include "dpif-netdev.h" > @@ -1549,8 +1550,31 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, > if (tso) { > netdev_linux_prepend_vnet_hdr(packet, mtu); > } > - > - iov[i].iov_base = dp_packet_data(packet); > + /* It is a GROed packet which has multiple segments, so need to merge > + * as a big packet in order that sendmmsg can handle it correctly. > + */ > + if (packet->mbuf.nb_segs > 1) { > + struct dp_packet *new_packet = > + dp_packet_new(dp_packet_size(packet)); > + struct rte_mbuf *next = (struct rte_mbuf *)packet; > + uint32_t offset = 0; > + > + iov[i].iov_base = dp_packet_data(new_packet); > + /* Copy multi-seg mbuf data to linear buffer */ > + while (next) { > + memcpy((uint8_t *)dp_packet_data(new_packet) + offset, > + rte_pktmbuf_mtod(next, char *), > + next->data_len); > + offset += next->data_len; > + next = next->next; > + } > + dp_packet_set_size(new_packet, offset); > + dp_packet_delete(packet); > + batch->packets[i] = new_packet; > + packet = new_packet; > + } else { > + iov[i].iov_base = dp_packet_data(packet); > + } > iov[i].iov_len = dp_packet_size(packet); > mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll, > .msg_namelen = sizeof sll, > @@ -6624,17 +6648,93 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b) > } > > static void > +netdev_linux_set_ol_flags_and_ip_cksum(struct dp_packet *b, int mtu) > +{ > + struct eth_header *eth_hdr; > + uint16_t l4proto = 0; > + ovs_be16 eth_type; > + int l2_len; > + > + eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN); > + if (!eth_hdr) { > + return; > + } > + > + l2_len = ETH_HEADER_LEN; > + eth_type = eth_hdr->eth_type; > + if (eth_type_vlan(eth_type)) { > + struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN); > + > + if (!vlan) { > + return; > + } > + > + eth_type = vlan->vlan_next_type; > + l2_len += VLAN_HEADER_LEN; > + } > + > + if (eth_type == htons(ETH_TYPE_IP)) { > + struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN); > + > + if (!ip_hdr) { > + return; > + } > + > + ip_hdr->ip_csum = 0; > + ip_hdr->ip_csum = csum(ip_hdr, sizeof *ip_hdr); > + l4proto = ip_hdr->ip_proto; > + dp_packet_hwol_set_tx_ipv4(b); > + } else if (eth_type == htons(ETH_TYPE_IPV6)) { > + struct ovs_16aligned_ip6_hdr *nh6; > + > + nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN); > + if (!nh6) { > + return; > + } > + > + l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt; > + dp_packet_hwol_set_tx_ipv6(b); > + } > + > + if (l4proto == IPPROTO_TCP) { > + /* Note: needn't set tcp checksum */ > + if (dp_packet_size(b) > mtu + b->mbuf.l2_len) { > + dp_packet_hwol_set_tcp_seg(b); > + } > + dp_packet_hwol_set_csum_tcp(b); > + } else if (l4proto == IPPROTO_UDP) { > + if (dp_packet_size(b) > mtu + b->mbuf.l2_len) { > + dp_packet_hwol_set_udp_seg(b); > + } > + dp_packet_hwol_set_csum_udp(b); > + } > +} > + > +static void > netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) > { > - struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet); > + struct virtio_net_hdr *vnet; > + > + /* ol_flags weren't set correctly for received packets which are from > + * physical port and GROed, so it has to been set again in order that > + * vnet_hdr can be prepended correctly. > + */ > + if ((dp_packet_size(b) > mtu + b->mbuf.l2_len) > + && !dp_packet_hwol_l4_mask(b)) { > + netdev_linux_set_ol_flags_and_ip_cksum(b, mtu); > + } > + > + vnet = dp_packet_push_zeros(b, sizeof *vnet); > > - if (dp_packet_hwol_is_tso(b)) { > + if (dp_packet_hwol_is_tso(b) || dp_packet_hwol_is_uso(b)) { > uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b)) > - + TCP_HEADER_LEN; > + + b->mbuf.l4_len; > > vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len; > vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len); > - if (dp_packet_hwol_is_ipv4(b)) { > + if (dp_packet_hwol_is_uso(b)) { > + vnet->gso_type = VIRTIO_NET_HDR_GSO_UDP; > + } else if (dp_packet_hwol_is_ipv4(b)) { > vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; > } else { > vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; >
-----邮件原件----- 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets 发送时间: 2020年10月27日 21:12 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org 抄送: fbl@sysclose.org; i.maximets@ovn.org 主题: Re: [ovs-dev] [PATCH V3 3/4] Add VXLAN TCP and UDP GRO support for DPDK data path On 8/7/20 12:56 PM, yang_y_yi@163.com wrote: > From: Yi Yang <yangyi01@inspur.com> > > GRO(Generic Receive Offload) can help improve performance when TSO > (TCP Segment Offload) or VXLAN TSO is enabled on transmit side, this > can avoid overhead of ovs DPDK data path and enqueue vhost for VM by > merging many small packets to large packets (65535 bytes at most) once > it receives packets from physical NIC. IIUC, this patch allows multi-segment mbufs to float across different parts of OVS. This will definitely crash it somewhere. Much more changes all over the OVS required to make it safely work with such mbufs. There were few attempts to introduce this support, but all of them ended up being rejected. As it is this patch is not acceptable as it doesn't cover almost anything beside simple cases inside netdev implementation. Here is the latest attempt with multi-segment mbufs: https://patchwork.ozlabs.org/project/openvswitch/list/?series=130193&state=* Best regards, Ilya Maximets. [Yi Yang] We have to support this because we have supported TSO for TCP, it can't handle big UDP, this is why we must introduce GSO, the prerequisite of GSO is multi-segment must be enabled because GSOed mbufs are multi-segmented, but it is just last step before dpdk Tx, so I don't think it is an issue, per my test in our openstack environment, I didn't encounter any crash, this just enabled DPDK PMD driver to handle GSOed mbuf. For GRO, reassembling also use chained multi-segment mbuf to avoid copy, per long time test, it also didn't lead to any crash. We can fix some corner cases if they aren't covered. > > It can work for both VXLAN and vlan case. > > Signed-off-by: Yi Yang <yangyi01@inspur.com> > --- > lib/dp-packet.h | 37 ++++++++- > lib/netdev-dpdk.c | 227 > ++++++++++++++++++++++++++++++++++++++++++++++++++++- > lib/netdev-linux.c | 112 ++++++++++++++++++++++++-- > 3 files changed, 365 insertions(+), 11 deletions(-) > > diff --git a/lib/dp-packet.h b/lib/dp-packet.h index c33868d..18307c0 > 100644 > --- a/lib/dp-packet.h > +++ b/lib/dp-packet.h > @@ -580,7 +580,16 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v) > * (and thus 'v') will always be <= UINT16_MAX; this means that there is no > * loss of accuracy in assigning 'v' to 'data_len'. > */ > - b->mbuf.data_len = (uint16_t)v; /* Current seg length. */ > + if (b->mbuf.nb_segs <= 1) { > + b->mbuf.data_len = (uint16_t)v; /* Current seg length. */ > + } else { > + /* For multi-seg packet, if it is resize, data_len should be > + * adjusted by offset, this will happend in case of push or pop. > + */ > + if (b->mbuf.pkt_len != 0) { > + b->mbuf.data_len += v - b->mbuf.pkt_len; > + } > + } > b->mbuf.pkt_len = v; /* Total length of all segments linked to > * this segment. */ } @@ > -1092,6 +1101,20 @@ dp_packet_hwol_set_l4_len(struct dp_packet *b, int > l4_len) { > b->mbuf.l4_len = l4_len; > } > + > +/* Set outer_l2_len for the packet 'b' */ static inline void > +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int > +outer_l2_len) { > + b->mbuf.outer_l2_len = outer_l2_len; } > + > +/* Set outer_l3_len for the packet 'b' */ static inline void > +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int > +outer_l3_len) { > + b->mbuf.outer_l3_len = outer_l3_len; } > #else > /* Mark packet 'b' for VXLAN TCP segmentation offloading. */ static > inline void @@ -1125,6 +1148,18 @@ dp_packet_hwol_set_l4_len(struct > dp_packet *b OVS_UNUSED, > int l4_len OVS_UNUSED) { } > + > +/* Set outer_l2_len for the packet 'b' */ static inline void > +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int > +outer_l2_len) { } > + > +/* Set outer_l3_len for the packet 'b' */ static inline void > +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int > +outer_l3_len) { } > #endif /* DPDK_NETDEV */ > > static inline bool > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c > index 888a45e..b6c57a6 100644 > --- a/lib/netdev-dpdk.c > +++ b/lib/netdev-dpdk.c > @@ -25,6 +25,7 @@ > #include <linux/virtio_net.h> > #include <sys/socket.h> > #include <linux/if.h> > +#include <linux/unistd.h> > > /* Include rte_compat.h first to allow experimental API's needed for the > * rte_meter.h rfc4115 functions. Once they are no longer marked as > @@ -47,6 +48,7 @@ > #include <rte_version.h> > #include <rte_vhost.h> > #include <rte_ip_frag.h> > +#include <rte_gro.h> > > #include "cmap.h" > #include "coverage.h" > @@ -2184,6 +2186,8 @@ get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype) > } > } > > +#define UDP_VXLAN_ETH_HDR_SIZE 30 > + > /* Prepare the packet for HWOL. > * Return True if the packet is OK to continue. */ > static bool > @@ -2207,6 +2211,42 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) > return true; > } > > + /* ol_flags is cleaned after vxlan pop, so need reset for those packets. > + * Such packets are only for local VMs or namespaces, so need to return > + * after ol_flags, l2_len, l3_len and tso_segsz are set. > + */ > + if (((mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) == 0) && > + (mbuf->l2_len == UDP_VXLAN_ETH_HDR_SIZE) && > + (mbuf->pkt_len > 1464)) { > + mbuf->ol_flags = 0; > + mbuf->l2_len -= sizeof(struct udp_header) > + + sizeof(struct vxlanhdr); > + if (mbuf->l3_len == IP_HEADER_LEN) { > + mbuf->ol_flags |= PKT_TX_IPV4; > + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1); > + l4_proto = ip_hdr->next_proto_id; > + } else if (mbuf->l3_len == IPV6_HEADER_LEN) { > + mbuf->ol_flags |= PKT_TX_IPV6; > + ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1); > + l4_proto = ip6_hdr->proto; > + } > + > + mbuf->ol_flags |= PKT_TX_IP_CKSUM; > + if (l4_proto == IPPROTO_TCP) { > + mbuf->ol_flags |= PKT_TX_TCP_SEG; > + mbuf->ol_flags |= PKT_TX_TCP_CKSUM; > + } else if (l4_proto == IPPROTO_UDP) { > + mbuf->ol_flags |= PKT_TX_UDP_SEG; > + mbuf->ol_flags |= PKT_TX_UDP_CKSUM; > + } > + mbuf->tso_segsz = 1450; > + if (mbuf->tso_segsz > dev->mtu) { > + mbuf->tso_segsz = dev->mtu; > + } > + > + return true; > + } > + > if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) { > /* Handle VXLAN TSO */ > struct rte_udp_hdr *udp_hdr; > @@ -2853,6 +2893,104 @@ netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq) > return dev->vhost_rxq_enabled[rxq->queue_id]; > } > > +#define VXLAN_DST_PORT 4789 > + > +static void > +netdev_dpdk_parse_hdr(struct dp_packet *pkt, int offset, uint16_t *l4_proto, > + int *is_frag) > +{ > + struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt; > + struct rte_ether_hdr *eth_hdr = > + rte_pktmbuf_mtod_offset(mbuf, struct rte_ether_hdr *, offset); > + ovs_be16 eth_type; > + int l2_len; > + int l3_len = 0; > + int l4_len = 0; > + uint16_t inner_l4_proto = 0; > + int inner_is_frag = 0; > + > + if (offset == 0) { > + *is_frag = 0; > + } > + mbuf->packet_type = 0; > + l2_len = ETH_HEADER_LEN; > + eth_type = (OVS_FORCE ovs_be16) eth_hdr->ether_type; > + if (eth_type_vlan(eth_type)) { > + struct rte_vlan_hdr *vlan_hdr = > + (struct rte_vlan_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET); > + > + eth_type = (OVS_FORCE ovs_be16) vlan_hdr->eth_proto; > + l2_len += VLAN_HEADER_LEN; > + } > + > + dp_packet_hwol_set_l2_len(pkt, l2_len); > + dp_packet_hwol_set_outer_l2_len(pkt, 0); > + dp_packet_hwol_set_outer_l3_len(pkt, 0); > + > + if (eth_type == htons(ETH_TYPE_IP)) { > + struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *) > + ((char *)eth_hdr + l2_len); > + > + l3_len = IP_HEADER_LEN; > + dp_packet_hwol_set_tx_ipv4(pkt); > + *l4_proto = ipv4_hdr->next_proto_id; > + *is_frag = rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr); > + mbuf->packet_type |= RTE_PTYPE_L3_IPV4; > + } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) { > + struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *) > + ((char *)eth_hdr + l2_len); > + l3_len = IPV6_HEADER_LEN; > + dp_packet_hwol_set_tx_ipv6(pkt); > + *l4_proto = ipv6_hdr->proto; > + } > + > + dp_packet_hwol_set_l3_len(pkt, l3_len); > + > + if (*l4_proto == IPPROTO_TCP) { > + struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *) > + ((char *)eth_hdr + l2_len + l3_len); > + > + l4_len = (tcp_hdr->data_off & 0xf0) >> 2; > + dp_packet_hwol_set_l4_len(pkt, l4_len); > + mbuf->packet_type |= RTE_PTYPE_L4_TCP; > + } else if (*l4_proto == IPPROTO_UDP) { > + struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *) > + ((char *)eth_hdr + l2_len + l3_len); > + > + l4_len = sizeof(*udp_hdr); > + dp_packet_hwol_set_l4_len(pkt, l4_len); > + mbuf->packet_type |= RTE_PTYPE_L4_UDP; > + > + /* Need to parse inner packet if needed */ > + if (ntohs(udp_hdr->dst_port) == VXLAN_DST_PORT) { > + netdev_dpdk_parse_hdr(pkt, > + l2_len + l3_len + l4_len > + + sizeof(struct vxlanhdr), > + &inner_l4_proto, > + &inner_is_frag); > + mbuf->l2_len += sizeof(struct rte_udp_hdr) > + + sizeof(struct vxlanhdr); > + dp_packet_hwol_set_outer_l2_len(pkt, l2_len); > + dp_packet_hwol_set_outer_l3_len(pkt, l3_len); > + > + /* Set packet_type, it is necessary for GRO */ > + mbuf->packet_type |= RTE_PTYPE_TUNNEL_VXLAN; > + if (mbuf->l3_len == IP_HEADER_LEN) { > + mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4; > + } > + if (inner_l4_proto == IPPROTO_TCP) { > + mbuf->packet_type |= RTE_PTYPE_INNER_L4_TCP; > + mbuf->packet_type |= RTE_PTYPE_L4_UDP; > + } else if (inner_l4_proto == IPPROTO_UDP) { > + mbuf->packet_type |= RTE_PTYPE_INNER_L4_UDP; > + mbuf->packet_type |= RTE_PTYPE_L4_UDP; > + } > + } > + } > +} > + > +static RTE_DEFINE_PER_LCORE(void *, _gro_ctx); > + > static int > netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, > int *qfill) > @@ -2862,6 +3000,36 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, > struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev); > int nb_rx; > int dropped = 0; > + struct rte_gro_param gro_param; > + struct dp_packet *packet; > + struct dp_packet *udp_pkts[NETDEV_MAX_BURST]; > + struct dp_packet *other_pkts[NETDEV_MAX_BURST]; > + > + int nb_udp_rx = 0; > + int nb_other_rx = 0; > + > + /* Initialize GRO parameters */ > + gro_param.gro_types = RTE_GRO_TCP_IPV4 | > + RTE_GRO_UDP_IPV4 | > + RTE_GRO_IPV4_VXLAN_TCP_IPV4 | > + RTE_GRO_IPV4_VXLAN_UDP_IPV4; > + gro_param.max_flow_num = 1024; > + /* There are 46 fragments for a 64K big packet */ > + gro_param.max_item_per_flow = NETDEV_MAX_BURST * 2; > + > + /* Initialize GRO context */ > + if (RTE_PER_LCORE(_gro_ctx) == NULL) { > + uint32_t cpu, node; > + int ret; > + > + ret = syscall(__NR_getcpu, &cpu, &node, NULL); > + if (ret == 0) { > + gro_param.socket_id = node; > + } else { > + gro_param.socket_id = 0; > + } > + RTE_PER_LCORE(_gro_ctx) = rte_gro_ctx_create(&gro_param); > + } > > if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) { > return EAGAIN; > @@ -2890,7 +3058,58 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, > rte_spinlock_unlock(&dev->stats_lock); > } > > + /* Need to parse packet header and set necessary fields in mbuf for GRO */ > batch->count = nb_rx; > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + uint16_t l4_proto = 0; > + int is_frag = 0; > + > + netdev_dpdk_parse_hdr(packet, 0, &l4_proto, &is_frag); > + if (packet->mbuf.packet_type & RTE_PTYPE_TUNNEL_VXLAN) { > + if (packet->mbuf.packet_type & RTE_PTYPE_INNER_L4_UDP) { > + udp_pkts[nb_udp_rx++] = packet; > + } else { > + other_pkts[nb_other_rx++] = packet; > + } > + } else { > + if (packet->mbuf.packet_type & RTE_PTYPE_L4_UDP) { > + udp_pkts[nb_udp_rx++] = packet; > + } else { > + other_pkts[nb_other_rx++] = packet; > + } > + } > + } > + > + /* Do GRO here if needed, note: IP fragment can be out of order */ > + if (nb_udp_rx) { > + /* UDP packet must use heavy rte_gro_reassemble */ > + nb_udp_rx = rte_gro_reassemble((struct rte_mbuf **) udp_pkts, > + nb_udp_rx, RTE_PER_LCORE(_gro_ctx)); > + nb_udp_rx += rte_gro_timeout_flush(RTE_PER_LCORE(_gro_ctx), 10000, > + RTE_GRO_UDP_IPV4 > + | RTE_GRO_IPV4_VXLAN_UDP_IPV4, > + (struct rte_mbuf **)&udp_pkts[nb_udp_rx], > + NETDEV_MAX_BURST - nb_udp_rx); > + } > + > + if (nb_other_rx) { > + /* TCP packet is better for lightweigh rte_gro_reassemble_burst */ > + nb_other_rx = rte_gro_reassemble_burst((struct rte_mbuf **) other_pkts, > + nb_other_rx, > + &gro_param); > + } > + > + batch->count = nb_udp_rx + nb_other_rx; > + if (nb_udp_rx) { > + memcpy(batch->packets, udp_pkts, > + nb_udp_rx * sizeof(struct dp_packet *)); > + } > + > + if (nb_other_rx) { > + memcpy(&batch->packets[nb_udp_rx], other_pkts, > + nb_other_rx * sizeof(struct dp_packet *)); > + } > + > dp_packet_batch_init_packet_fields(batch); > > if (qfill) { > @@ -2931,10 +3150,11 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, > for (i = 0; i < pkt_cnt; i++) { > pkt = pkts[i]; > if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len) > - && !(pkt->ol_flags & PKT_TX_TCP_SEG))) { > + && !(pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) { > VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " " > - "max_packet_len %d", dev->up.name, pkt->pkt_len, > - dev->max_packet_len); > + "max_packet_len %d ol_flags 0x%016lx", > + dev->up.name, pkt->pkt_len, > + dev->max_packet_len, pkt->ol_flags); > rte_pktmbuf_free(pkt); > continue; > } > @@ -3289,7 +3509,6 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, > struct dp_packet_batch *batch, > bool concurrent_txq OVS_UNUSED) > { > - > if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) { > dpdk_do_tx_copy(netdev, qid, batch); > dp_packet_delete_batch(batch, true); > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c > index 557f139..d8a035a 100644 > --- a/lib/netdev-linux.c > +++ b/lib/netdev-linux.c > @@ -50,6 +50,7 @@ > #include <unistd.h> > > #include "coverage.h" > +#include "csum.h" > #include "dp-packet.h" > #include "dpif-netlink.h" > #include "dpif-netdev.h" > @@ -1549,8 +1550,31 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, > if (tso) { > netdev_linux_prepend_vnet_hdr(packet, mtu); > } > - > - iov[i].iov_base = dp_packet_data(packet); > + /* It is a GROed packet which has multiple segments, so need to merge > + * as a big packet in order that sendmmsg can handle it correctly. > + */ > + if (packet->mbuf.nb_segs > 1) { > + struct dp_packet *new_packet = > + dp_packet_new(dp_packet_size(packet)); > + struct rte_mbuf *next = (struct rte_mbuf *)packet; > + uint32_t offset = 0; > + > + iov[i].iov_base = dp_packet_data(new_packet); > + /* Copy multi-seg mbuf data to linear buffer */ > + while (next) { > + memcpy((uint8_t *)dp_packet_data(new_packet) + offset, > + rte_pktmbuf_mtod(next, char *), > + next->data_len); > + offset += next->data_len; > + next = next->next; > + } > + dp_packet_set_size(new_packet, offset); > + dp_packet_delete(packet); > + batch->packets[i] = new_packet; > + packet = new_packet; > + } else { > + iov[i].iov_base = dp_packet_data(packet); > + } > iov[i].iov_len = dp_packet_size(packet); > mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll, > .msg_namelen = sizeof sll, > @@ -6624,17 +6648,93 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b) > } > > static void > +netdev_linux_set_ol_flags_and_ip_cksum(struct dp_packet *b, int mtu) > +{ > + struct eth_header *eth_hdr; > + uint16_t l4proto = 0; > + ovs_be16 eth_type; > + int l2_len; > + > + eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN); > + if (!eth_hdr) { > + return; > + } > + > + l2_len = ETH_HEADER_LEN; > + eth_type = eth_hdr->eth_type; > + if (eth_type_vlan(eth_type)) { > + struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN); > + > + if (!vlan) { > + return; > + } > + > + eth_type = vlan->vlan_next_type; > + l2_len += VLAN_HEADER_LEN; > + } > + > + if (eth_type == htons(ETH_TYPE_IP)) { > + struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN); > + > + if (!ip_hdr) { > + return; > + } > + > + ip_hdr->ip_csum = 0; > + ip_hdr->ip_csum = csum(ip_hdr, sizeof *ip_hdr); > + l4proto = ip_hdr->ip_proto; > + dp_packet_hwol_set_tx_ipv4(b); > + } else if (eth_type == htons(ETH_TYPE_IPV6)) { > + struct ovs_16aligned_ip6_hdr *nh6; > + > + nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN); > + if (!nh6) { > + return; > + } > + > + l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt; > + dp_packet_hwol_set_tx_ipv6(b); > + } > + > + if (l4proto == IPPROTO_TCP) { > + /* Note: needn't set tcp checksum */ > + if (dp_packet_size(b) > mtu + b->mbuf.l2_len) { > + dp_packet_hwol_set_tcp_seg(b); > + } > + dp_packet_hwol_set_csum_tcp(b); > + } else if (l4proto == IPPROTO_UDP) { > + if (dp_packet_size(b) > mtu + b->mbuf.l2_len) { > + dp_packet_hwol_set_udp_seg(b); > + } > + dp_packet_hwol_set_csum_udp(b); > + } > +} > + > +static void > netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) > { > - struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet); > + struct virtio_net_hdr *vnet; > + > + /* ol_flags weren't set correctly for received packets which are from > + * physical port and GROed, so it has to been set again in order that > + * vnet_hdr can be prepended correctly. > + */ > + if ((dp_packet_size(b) > mtu + b->mbuf.l2_len) > + && !dp_packet_hwol_l4_mask(b)) { > + netdev_linux_set_ol_flags_and_ip_cksum(b, mtu); > + } > + > + vnet = dp_packet_push_zeros(b, sizeof *vnet); > > - if (dp_packet_hwol_is_tso(b)) { > + if (dp_packet_hwol_is_tso(b) || dp_packet_hwol_is_uso(b)) { > uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b)) > - + TCP_HEADER_LEN; > + + b->mbuf.l4_len; > > vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len; > vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len); > - if (dp_packet_hwol_is_ipv4(b)) { > + if (dp_packet_hwol_is_uso(b)) { > + vnet->gso_type = VIRTIO_NET_HDR_GSO_UDP; > + } else if (dp_packet_hwol_is_ipv4(b)) { > vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; > } else { > vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; >
On Tue, Oct 27, 2020 at 5:50 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote: > > -----邮件原件----- > 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets > 发送时间: 2020年10月27日 21:12 > 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org > 抄送: fbl@sysclose.org; i.maximets@ovn.org > 主题: Re: [ovs-dev] [PATCH V3 3/4] Add VXLAN TCP and UDP GRO support for DPDK data path > > On 8/7/20 12:56 PM, yang_y_yi@163.com wrote: > > From: Yi Yang <yangyi01@inspur.com> > > > > GRO(Generic Receive Offload) can help improve performance when TSO > > (TCP Segment Offload) or VXLAN TSO is enabled on transmit side, this > > can avoid overhead of ovs DPDK data path and enqueue vhost for VM by > > merging many small packets to large packets (65535 bytes at most) once > > it receives packets from physical NIC. > > IIUC, this patch allows multi-segment mbufs to float across different parts of OVS. This will definitely crash it somewhere. Much more changes all over the OVS required to make it safely work with such mbufs. There were few attempts to introduce this support, but all of them ended up being rejected. As it is this patch is not acceptable as it doesn't cover almost anything beside simple cases inside netdev implementation. > > Here is the latest attempt with multi-segment mbufs: > https://patchwork.ozlabs.org/project/openvswitch/list/?series=130193&state=* Thanks, that's very helpful. Looks like a huge amount of work to introduce multi-seg mbuf. > > Best regards, Ilya Maximets. > > [Yi Yang] We have to support this because we have supported TSO for TCP, it can't handle big UDP, this is why we must introduce GSO, the prerequisite of GSO is multi-segment must be enabled because GSOed mbufs are multi-segmented, but it is just last step before dpdk Tx, so I don't think it is an issue, per my test in our openstack environment, I didn't encounter any crash, this just enabled DPDK PMD driver to handle GSOed mbuf. For GRO, reassembling also use chained multi-segment mbuf to avoid copy, per long time test, it also didn't lead to any crash. We can fix some corner cases if they aren't covered. > I just started to understand the problem. Sorry if I missed something. So currently what do we do to prevent DPDK sending OVS using multi-seg mbuf? Do we check it and linearize the mbuf? Can we make GSO/GRO working using linearized mbuf? Regards, William
I have sent v4 patch to remove GRO and GSO code to avoid such concern. GRO and GSO must use multi-seg mbuf, our local openstack environment has used these code but didn't see any corrupt issue, so I'm not sure what case will result in corrupt. Linearizing mbuf means copying, that will have huge impact on performance. -----邮件原件----- 发件人: William Tu [mailto:u9012063@gmail.com] 发送时间: 2021年2月7日 0:02 收件人: Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> 抄送: i.maximets@ovn.org; yang_y_yi@163.com; ovs-dev@openvswitch.org; fbl@sysclose.org 主题: Re: [ovs-dev] 答复: [PATCH V3 3/4] Add VXLAN TCP and UDP GRO support for DPDK data path On Tue, Oct 27, 2020 at 5:50 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote: > > -----邮件原件----- > 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 Ilya Maximets > 发送时间: 2020年10月27日 21:12 > 收件人: yang_y_yi@163.com; ovs-dev@openvswitch.org > 抄送: fbl@sysclose.org; i.maximets@ovn.org > 主题: Re: [ovs-dev] [PATCH V3 3/4] Add VXLAN TCP and UDP GRO support for > DPDK data path > > On 8/7/20 12:56 PM, yang_y_yi@163.com wrote: > > From: Yi Yang <yangyi01@inspur.com> > > > > GRO(Generic Receive Offload) can help improve performance when TSO > > (TCP Segment Offload) or VXLAN TSO is enabled on transmit side, this > > can avoid overhead of ovs DPDK data path and enqueue vhost for VM by > > merging many small packets to large packets (65535 bytes at most) > > once it receives packets from physical NIC. > > IIUC, this patch allows multi-segment mbufs to float across different parts of OVS. This will definitely crash it somewhere. Much more changes all over the OVS required to make it safely work with such mbufs. There were few attempts to introduce this support, but all of them ended up being rejected. As it is this patch is not acceptable as it doesn't cover almost anything beside simple cases inside netdev implementation. > > Here is the latest attempt with multi-segment mbufs: > https://patchwork.ozlabs.org/project/openvswitch/list/?series=130193&s > tate=* Thanks, that's very helpful. Looks like a huge amount of work to introduce multi-seg mbuf. > > Best regards, Ilya Maximets. > > [Yi Yang] We have to support this because we have supported TSO for TCP, it can't handle big UDP, this is why we must introduce GSO, the prerequisite of GSO is multi-segment must be enabled because GSOed mbufs are multi-segmented, but it is just last step before dpdk Tx, so I don't think it is an issue, per my test in our openstack environment, I didn't encounter any crash, this just enabled DPDK PMD driver to handle GSOed mbuf. For GRO, reassembling also use chained multi-segment mbuf to avoid copy, per long time test, it also didn't lead to any crash. We can fix some corner cases if they aren't covered. > I just started to understand the problem. Sorry if I missed something. So currently what do we do to prevent DPDK sending OVS using multi-seg mbuf? Do we check it and linearize the mbuf? Can we make GSO/GRO working using linearized mbuf? Regards, William
diff --git a/lib/dp-packet.h b/lib/dp-packet.h index c33868d..18307c0 100644 --- a/lib/dp-packet.h +++ b/lib/dp-packet.h @@ -580,7 +580,16 @@ dp_packet_set_size(struct dp_packet *b, uint32_t v) * (and thus 'v') will always be <= UINT16_MAX; this means that there is no * loss of accuracy in assigning 'v' to 'data_len'. */ - b->mbuf.data_len = (uint16_t)v; /* Current seg length. */ + if (b->mbuf.nb_segs <= 1) { + b->mbuf.data_len = (uint16_t)v; /* Current seg length. */ + } else { + /* For multi-seg packet, if it is resize, data_len should be + * adjusted by offset, this will happend in case of push or pop. + */ + if (b->mbuf.pkt_len != 0) { + b->mbuf.data_len += v - b->mbuf.pkt_len; + } + } b->mbuf.pkt_len = v; /* Total length of all segments linked to * this segment. */ } @@ -1092,6 +1101,20 @@ dp_packet_hwol_set_l4_len(struct dp_packet *b, int l4_len) { b->mbuf.l4_len = l4_len; } + +/* Set outer_l2_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int outer_l2_len) +{ + b->mbuf.outer_l2_len = outer_l2_len; +} + +/* Set outer_l3_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int outer_l3_len) +{ + b->mbuf.outer_l3_len = outer_l3_len; +} #else /* Mark packet 'b' for VXLAN TCP segmentation offloading. */ static inline void @@ -1125,6 +1148,18 @@ dp_packet_hwol_set_l4_len(struct dp_packet *b OVS_UNUSED, int l4_len OVS_UNUSED) { } + +/* Set outer_l2_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_outer_l2_len(struct dp_packet *b, int outer_l2_len) +{ +} + +/* Set outer_l3_len for the packet 'b' */ +static inline void +dp_packet_hwol_set_outer_l3_len(struct dp_packet *b, int outer_l3_len) +{ +} #endif /* DPDK_NETDEV */ static inline bool diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 888a45e..b6c57a6 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -25,6 +25,7 @@ #include <linux/virtio_net.h> #include <sys/socket.h> #include <linux/if.h> +#include <linux/unistd.h> /* Include rte_compat.h first to allow experimental API's needed for the * rte_meter.h rfc4115 functions. Once they are no longer marked as @@ -47,6 +48,7 @@ #include <rte_version.h> #include <rte_vhost.h> #include <rte_ip_frag.h> +#include <rte_gro.h> #include "cmap.h" #include "coverage.h" @@ -2184,6 +2186,8 @@ get_udptcp_checksum(void *l3_hdr, void *l4_hdr, uint16_t ethertype) } } +#define UDP_VXLAN_ETH_HDR_SIZE 30 + /* Prepare the packet for HWOL. * Return True if the packet is OK to continue. */ static bool @@ -2207,6 +2211,42 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, struct rte_mbuf *mbuf) return true; } + /* ol_flags is cleaned after vxlan pop, so need reset for those packets. + * Such packets are only for local VMs or namespaces, so need to return + * after ol_flags, l2_len, l3_len and tso_segsz are set. + */ + if (((mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) == 0) && + (mbuf->l2_len == UDP_VXLAN_ETH_HDR_SIZE) && + (mbuf->pkt_len > 1464)) { + mbuf->ol_flags = 0; + mbuf->l2_len -= sizeof(struct udp_header) + + sizeof(struct vxlanhdr); + if (mbuf->l3_len == IP_HEADER_LEN) { + mbuf->ol_flags |= PKT_TX_IPV4; + ip_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1); + l4_proto = ip_hdr->next_proto_id; + } else if (mbuf->l3_len == IPV6_HEADER_LEN) { + mbuf->ol_flags |= PKT_TX_IPV6; + ip6_hdr = (struct rte_ipv6_hdr *)(eth_hdr + 1); + l4_proto = ip6_hdr->proto; + } + + mbuf->ol_flags |= PKT_TX_IP_CKSUM; + if (l4_proto == IPPROTO_TCP) { + mbuf->ol_flags |= PKT_TX_TCP_SEG; + mbuf->ol_flags |= PKT_TX_TCP_CKSUM; + } else if (l4_proto == IPPROTO_UDP) { + mbuf->ol_flags |= PKT_TX_UDP_SEG; + mbuf->ol_flags |= PKT_TX_UDP_CKSUM; + } + mbuf->tso_segsz = 1450; + if (mbuf->tso_segsz > dev->mtu) { + mbuf->tso_segsz = dev->mtu; + } + + return true; + } + if (mbuf->ol_flags & PKT_TX_TUNNEL_VXLAN) { /* Handle VXLAN TSO */ struct rte_udp_hdr *udp_hdr; @@ -2853,6 +2893,104 @@ netdev_dpdk_vhost_rxq_enabled(struct netdev_rxq *rxq) return dev->vhost_rxq_enabled[rxq->queue_id]; } +#define VXLAN_DST_PORT 4789 + +static void +netdev_dpdk_parse_hdr(struct dp_packet *pkt, int offset, uint16_t *l4_proto, + int *is_frag) +{ + struct rte_mbuf *mbuf = (struct rte_mbuf *)pkt; + struct rte_ether_hdr *eth_hdr = + rte_pktmbuf_mtod_offset(mbuf, struct rte_ether_hdr *, offset); + ovs_be16 eth_type; + int l2_len; + int l3_len = 0; + int l4_len = 0; + uint16_t inner_l4_proto = 0; + int inner_is_frag = 0; + + if (offset == 0) { + *is_frag = 0; + } + mbuf->packet_type = 0; + l2_len = ETH_HEADER_LEN; + eth_type = (OVS_FORCE ovs_be16) eth_hdr->ether_type; + if (eth_type_vlan(eth_type)) { + struct rte_vlan_hdr *vlan_hdr = + (struct rte_vlan_hdr *)(eth_hdr + DPDK_RTE_HDR_OFFSET); + + eth_type = (OVS_FORCE ovs_be16) vlan_hdr->eth_proto; + l2_len += VLAN_HEADER_LEN; + } + + dp_packet_hwol_set_l2_len(pkt, l2_len); + dp_packet_hwol_set_outer_l2_len(pkt, 0); + dp_packet_hwol_set_outer_l3_len(pkt, 0); + + if (eth_type == htons(ETH_TYPE_IP)) { + struct rte_ipv4_hdr *ipv4_hdr = (struct rte_ipv4_hdr *) + ((char *)eth_hdr + l2_len); + + l3_len = IP_HEADER_LEN; + dp_packet_hwol_set_tx_ipv4(pkt); + *l4_proto = ipv4_hdr->next_proto_id; + *is_frag = rte_ipv4_frag_pkt_is_fragmented(ipv4_hdr); + mbuf->packet_type |= RTE_PTYPE_L3_IPV4; + } else if (eth_type == htons(RTE_ETHER_TYPE_IPV6)) { + struct rte_ipv6_hdr *ipv6_hdr = (struct rte_ipv6_hdr *) + ((char *)eth_hdr + l2_len); + l3_len = IPV6_HEADER_LEN; + dp_packet_hwol_set_tx_ipv6(pkt); + *l4_proto = ipv6_hdr->proto; + } + + dp_packet_hwol_set_l3_len(pkt, l3_len); + + if (*l4_proto == IPPROTO_TCP) { + struct rte_tcp_hdr *tcp_hdr = (struct rte_tcp_hdr *) + ((char *)eth_hdr + l2_len + l3_len); + + l4_len = (tcp_hdr->data_off & 0xf0) >> 2; + dp_packet_hwol_set_l4_len(pkt, l4_len); + mbuf->packet_type |= RTE_PTYPE_L4_TCP; + } else if (*l4_proto == IPPROTO_UDP) { + struct rte_udp_hdr *udp_hdr = (struct rte_udp_hdr *) + ((char *)eth_hdr + l2_len + l3_len); + + l4_len = sizeof(*udp_hdr); + dp_packet_hwol_set_l4_len(pkt, l4_len); + mbuf->packet_type |= RTE_PTYPE_L4_UDP; + + /* Need to parse inner packet if needed */ + if (ntohs(udp_hdr->dst_port) == VXLAN_DST_PORT) { + netdev_dpdk_parse_hdr(pkt, + l2_len + l3_len + l4_len + + sizeof(struct vxlanhdr), + &inner_l4_proto, + &inner_is_frag); + mbuf->l2_len += sizeof(struct rte_udp_hdr) + + sizeof(struct vxlanhdr); + dp_packet_hwol_set_outer_l2_len(pkt, l2_len); + dp_packet_hwol_set_outer_l3_len(pkt, l3_len); + + /* Set packet_type, it is necessary for GRO */ + mbuf->packet_type |= RTE_PTYPE_TUNNEL_VXLAN; + if (mbuf->l3_len == IP_HEADER_LEN) { + mbuf->packet_type |= RTE_PTYPE_INNER_L3_IPV4; + } + if (inner_l4_proto == IPPROTO_TCP) { + mbuf->packet_type |= RTE_PTYPE_INNER_L4_TCP; + mbuf->packet_type |= RTE_PTYPE_L4_UDP; + } else if (inner_l4_proto == IPPROTO_UDP) { + mbuf->packet_type |= RTE_PTYPE_INNER_L4_UDP; + mbuf->packet_type |= RTE_PTYPE_L4_UDP; + } + } + } +} + +static RTE_DEFINE_PER_LCORE(void *, _gro_ctx); + static int netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, int *qfill) @@ -2862,6 +3000,36 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, struct ingress_policer *policer = netdev_dpdk_get_ingress_policer(dev); int nb_rx; int dropped = 0; + struct rte_gro_param gro_param; + struct dp_packet *packet; + struct dp_packet *udp_pkts[NETDEV_MAX_BURST]; + struct dp_packet *other_pkts[NETDEV_MAX_BURST]; + + int nb_udp_rx = 0; + int nb_other_rx = 0; + + /* Initialize GRO parameters */ + gro_param.gro_types = RTE_GRO_TCP_IPV4 | + RTE_GRO_UDP_IPV4 | + RTE_GRO_IPV4_VXLAN_TCP_IPV4 | + RTE_GRO_IPV4_VXLAN_UDP_IPV4; + gro_param.max_flow_num = 1024; + /* There are 46 fragments for a 64K big packet */ + gro_param.max_item_per_flow = NETDEV_MAX_BURST * 2; + + /* Initialize GRO context */ + if (RTE_PER_LCORE(_gro_ctx) == NULL) { + uint32_t cpu, node; + int ret; + + ret = syscall(__NR_getcpu, &cpu, &node, NULL); + if (ret == 0) { + gro_param.socket_id = node; + } else { + gro_param.socket_id = 0; + } + RTE_PER_LCORE(_gro_ctx) = rte_gro_ctx_create(&gro_param); + } if (OVS_UNLIKELY(!(dev->flags & NETDEV_UP))) { return EAGAIN; @@ -2890,7 +3058,58 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet_batch *batch, rte_spinlock_unlock(&dev->stats_lock); } + /* Need to parse packet header and set necessary fields in mbuf for GRO */ batch->count = nb_rx; + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + uint16_t l4_proto = 0; + int is_frag = 0; + + netdev_dpdk_parse_hdr(packet, 0, &l4_proto, &is_frag); + if (packet->mbuf.packet_type & RTE_PTYPE_TUNNEL_VXLAN) { + if (packet->mbuf.packet_type & RTE_PTYPE_INNER_L4_UDP) { + udp_pkts[nb_udp_rx++] = packet; + } else { + other_pkts[nb_other_rx++] = packet; + } + } else { + if (packet->mbuf.packet_type & RTE_PTYPE_L4_UDP) { + udp_pkts[nb_udp_rx++] = packet; + } else { + other_pkts[nb_other_rx++] = packet; + } + } + } + + /* Do GRO here if needed, note: IP fragment can be out of order */ + if (nb_udp_rx) { + /* UDP packet must use heavy rte_gro_reassemble */ + nb_udp_rx = rte_gro_reassemble((struct rte_mbuf **) udp_pkts, + nb_udp_rx, RTE_PER_LCORE(_gro_ctx)); + nb_udp_rx += rte_gro_timeout_flush(RTE_PER_LCORE(_gro_ctx), 10000, + RTE_GRO_UDP_IPV4 + | RTE_GRO_IPV4_VXLAN_UDP_IPV4, + (struct rte_mbuf **)&udp_pkts[nb_udp_rx], + NETDEV_MAX_BURST - nb_udp_rx); + } + + if (nb_other_rx) { + /* TCP packet is better for lightweigh rte_gro_reassemble_burst */ + nb_other_rx = rte_gro_reassemble_burst((struct rte_mbuf **) other_pkts, + nb_other_rx, + &gro_param); + } + + batch->count = nb_udp_rx + nb_other_rx; + if (nb_udp_rx) { + memcpy(batch->packets, udp_pkts, + nb_udp_rx * sizeof(struct dp_packet *)); + } + + if (nb_other_rx) { + memcpy(&batch->packets[nb_udp_rx], other_pkts, + nb_other_rx * sizeof(struct dp_packet *)); + } + dp_packet_batch_init_packet_fields(batch); if (qfill) { @@ -2931,10 +3150,11 @@ netdev_dpdk_filter_packet_len(struct netdev_dpdk *dev, struct rte_mbuf **pkts, for (i = 0; i < pkt_cnt; i++) { pkt = pkts[i]; if (OVS_UNLIKELY((pkt->pkt_len > dev->max_packet_len) - && !(pkt->ol_flags & PKT_TX_TCP_SEG))) { + && !(pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_UDP_SEG)))) { VLOG_WARN_RL(&rl, "%s: Too big size %" PRIu32 " " - "max_packet_len %d", dev->up.name, pkt->pkt_len, - dev->max_packet_len); + "max_packet_len %d ol_flags 0x%016lx", + dev->up.name, pkt->pkt_len, + dev->max_packet_len, pkt->ol_flags); rte_pktmbuf_free(pkt); continue; } @@ -3289,7 +3509,6 @@ netdev_dpdk_vhost_send(struct netdev *netdev, int qid, struct dp_packet_batch *batch, bool concurrent_txq OVS_UNUSED) { - if (OVS_UNLIKELY(batch->packets[0]->source != DPBUF_DPDK)) { dpdk_do_tx_copy(netdev, qid, batch); dp_packet_delete_batch(batch, true); diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index 557f139..d8a035a 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -50,6 +50,7 @@ #include <unistd.h> #include "coverage.h" +#include "csum.h" #include "dp-packet.h" #include "dpif-netlink.h" #include "dpif-netdev.h" @@ -1549,8 +1550,31 @@ netdev_linux_sock_batch_send(int sock, int ifindex, bool tso, int mtu, if (tso) { netdev_linux_prepend_vnet_hdr(packet, mtu); } - - iov[i].iov_base = dp_packet_data(packet); + /* It is a GROed packet which has multiple segments, so need to merge + * as a big packet in order that sendmmsg can handle it correctly. + */ + if (packet->mbuf.nb_segs > 1) { + struct dp_packet *new_packet = + dp_packet_new(dp_packet_size(packet)); + struct rte_mbuf *next = (struct rte_mbuf *)packet; + uint32_t offset = 0; + + iov[i].iov_base = dp_packet_data(new_packet); + /* Copy multi-seg mbuf data to linear buffer */ + while (next) { + memcpy((uint8_t *)dp_packet_data(new_packet) + offset, + rte_pktmbuf_mtod(next, char *), + next->data_len); + offset += next->data_len; + next = next->next; + } + dp_packet_set_size(new_packet, offset); + dp_packet_delete(packet); + batch->packets[i] = new_packet; + packet = new_packet; + } else { + iov[i].iov_base = dp_packet_data(packet); + } iov[i].iov_len = dp_packet_size(packet); mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll, .msg_namelen = sizeof sll, @@ -6624,17 +6648,93 @@ netdev_linux_parse_vnet_hdr(struct dp_packet *b) } static void +netdev_linux_set_ol_flags_and_ip_cksum(struct dp_packet *b, int mtu) +{ + struct eth_header *eth_hdr; + uint16_t l4proto = 0; + ovs_be16 eth_type; + int l2_len; + + eth_hdr = dp_packet_at(b, 0, ETH_HEADER_LEN); + if (!eth_hdr) { + return; + } + + l2_len = ETH_HEADER_LEN; + eth_type = eth_hdr->eth_type; + if (eth_type_vlan(eth_type)) { + struct vlan_header *vlan = dp_packet_at(b, l2_len, VLAN_HEADER_LEN); + + if (!vlan) { + return; + } + + eth_type = vlan->vlan_next_type; + l2_len += VLAN_HEADER_LEN; + } + + if (eth_type == htons(ETH_TYPE_IP)) { + struct ip_header *ip_hdr = dp_packet_at(b, l2_len, IP_HEADER_LEN); + + if (!ip_hdr) { + return; + } + + ip_hdr->ip_csum = 0; + ip_hdr->ip_csum = csum(ip_hdr, sizeof *ip_hdr); + l4proto = ip_hdr->ip_proto; + dp_packet_hwol_set_tx_ipv4(b); + } else if (eth_type == htons(ETH_TYPE_IPV6)) { + struct ovs_16aligned_ip6_hdr *nh6; + + nh6 = dp_packet_at(b, l2_len, IPV6_HEADER_LEN); + if (!nh6) { + return; + } + + l4proto = nh6->ip6_ctlun.ip6_un1.ip6_un1_nxt; + dp_packet_hwol_set_tx_ipv6(b); + } + + if (l4proto == IPPROTO_TCP) { + /* Note: needn't set tcp checksum */ + if (dp_packet_size(b) > mtu + b->mbuf.l2_len) { + dp_packet_hwol_set_tcp_seg(b); + } + dp_packet_hwol_set_csum_tcp(b); + } else if (l4proto == IPPROTO_UDP) { + if (dp_packet_size(b) > mtu + b->mbuf.l2_len) { + dp_packet_hwol_set_udp_seg(b); + } + dp_packet_hwol_set_csum_udp(b); + } +} + +static void netdev_linux_prepend_vnet_hdr(struct dp_packet *b, int mtu) { - struct virtio_net_hdr *vnet = dp_packet_push_zeros(b, sizeof *vnet); + struct virtio_net_hdr *vnet; + + /* ol_flags weren't set correctly for received packets which are from + * physical port and GROed, so it has to been set again in order that + * vnet_hdr can be prepended correctly. + */ + if ((dp_packet_size(b) > mtu + b->mbuf.l2_len) + && !dp_packet_hwol_l4_mask(b)) { + netdev_linux_set_ol_flags_and_ip_cksum(b, mtu); + } + + vnet = dp_packet_push_zeros(b, sizeof *vnet); - if (dp_packet_hwol_is_tso(b)) { + if (dp_packet_hwol_is_tso(b) || dp_packet_hwol_is_uso(b)) { uint16_t hdr_len = ((char *)dp_packet_l4(b) - (char *)dp_packet_eth(b)) - + TCP_HEADER_LEN; + + b->mbuf.l4_len; vnet->hdr_len = (OVS_FORCE __virtio16)hdr_len; vnet->gso_size = (OVS_FORCE __virtio16)(mtu - hdr_len); - if (dp_packet_hwol_is_ipv4(b)) { + if (dp_packet_hwol_is_uso(b)) { + vnet->gso_type = VIRTIO_NET_HDR_GSO_UDP; + } else if (dp_packet_hwol_is_ipv4(b)) { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; } else { vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;