Message ID | 20200306143445.6616-1-yang_y_yi@163.com |
---|---|
State | Superseded |
Headers | show |
Series | [ovs-dev,v6] Use TPACKET_V3 to accelerate veth for userspace datapath | expand |
On Fri, Mar 6, 2020 at 6:35 AM <yang_y_yi@163.com> wrote: > > From: Yi Yang <yangyi01@inspur.com> > > We can avoid high system call overhead by using TPACKET_V3 > and using DPDK-like poll to receive and send packets (Note: send > still needs to call sendto to trigger final packet transmission). > > From Linux kernel 3.10 on, TPACKET_V3 has been supported, > so all the Linux kernels current OVS supports can run > TPACKET_V3 without any problem. > > I can see about 30% performance improvement for veth compared to > last recvmmsg optimization if I use TPACKET_V3, it is about 1.98 > Gbps, but it was 1.47 Gbps before. On my testbed, I didn't see any performance gain. In a 100-second TCP iperf3 run, I see the same 1.70 Gbps with and without tpacket. Do you think the performance might be better if we set .is_pmd=true, since tpacket is ring-based? > > TPACKET_V3 can support TSO, but its performance isn't good because > of TPACKET_V3 kernel implementation issue, so it falls back to What's the implementation issue? If we use the latest kernel, does the issue still exist? > recvmmsg in case userspace-tso-enable is set to true, but its > performance is better than recvmmsg in case userspace-tso-enable is > set to false, so just use TPACKET_V3 in that case. 
> > Signed-off-by: Yi Yang <yangyi01@inspur.com> > Co-authored-by: William Tu <u9012063@gmail.com> > Signed-off-by: William Tu <u9012063@gmail.com> > --- > acinclude.m4 | 12 ++ > configure.ac | 1 + > include/linux/automake.mk | 1 + > include/linux/if_packet.h | 128 ++++++++++++ > include/sparse/linux/if_packet.h | 111 +++++++++++ > lib/netdev-linux-private.h | 22 +++ > lib/netdev-linux.c | 411 +++++++++++++++++++++++++++++++++++++-- > 7 files changed, 670 insertions(+), 16 deletions(-) > create mode 100644 include/linux/if_packet.h > > Changelog: > - v5->v6 > * Fall back to recvmmsg in case userspace-tso-enable is true > because of TPACKET_V3 kernel implementation issue for tso > support > > - v4->v5 > * Fix travis build issues > * Fix comments issues (capitalize the first letter) > * Verify TSO on Ubuntu 18.04 3.5.0-40-generic > > - v3->v4 > * Fix sparse check errors > > - v2->v3 > * Fix build issues in case HAVE_TPACKET_V3 is not defined > * Add tso-related support code > * make sure it can work normally in case userspace-tso-enable is true > > - v1->v2 > * Remove TPACKET_V1 and TPACKET_V2 which is obsolete > * Add include/linux/if_packet.h > * Change include/sparse/linux/if_packet.h > > diff --git a/acinclude.m4 b/acinclude.m4 > index 1212a46..b39bbb9 100644 > --- a/acinclude.m4 > +++ b/acinclude.m4 > @@ -1093,6 +1093,18 @@ AC_DEFUN([OVS_CHECK_IF_DL], > AC_SEARCH_LIBS([pcap_open_live], [pcap]) > fi]) > > +dnl OVS_CHECK_LINUX_TPACKET > +dnl > +dnl Configure Linux TPACKET. > +AC_DEFUN([OVS_CHECK_LINUX_TPACKET], [ > + AC_COMPILE_IFELSE([ > + AC_LANG_PROGRAM([#include <linux/if_packet.h>], [ > + struct tpacket3_hdr x = { 0 }; > + ])], > + [AC_DEFINE([HAVE_TPACKET_V3], [1], > + [Define to 1 if struct tpacket3_hdr is available.])]) > +]) > + > dnl Checks for buggy strtok_r. 
> dnl > dnl Some versions of glibc 2.7 has a bug in strtok_r when compiling > diff --git a/configure.ac b/configure.ac > index 1877aae..b61a1f4 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -89,6 +89,7 @@ OVS_CHECK_VISUAL_STUDIO_DDK > OVS_CHECK_COVERAGE > OVS_CHECK_NDEBUG > OVS_CHECK_NETLINK > +OVS_CHECK_LINUX_TPACKET > OVS_CHECK_OPENSSL > OVS_CHECK_LIBCAPNG > OVS_CHECK_LOGDIR > diff --git a/include/linux/automake.mk b/include/linux/automake.mk > index 8f063f4..a659e65 100644 > --- a/include/linux/automake.mk > +++ b/include/linux/automake.mk > @@ -1,4 +1,5 @@ > noinst_HEADERS += \ > + include/linux/if_packet.h \ > include/linux/netlink.h \ > include/linux/netfilter/nf_conntrack_sctp.h \ > include/linux/pkt_cls.h \ > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h > new file mode 100644 > index 0000000..e20aacc > --- /dev/null > +++ b/include/linux/if_packet.h If OVS_CHECK_LINUX_TPACKET fails, can we simply fall back to recvmmsg? Then this header would not be needed, would it? 
> @@ -0,0 +1,128 @@ > +#ifndef __LINUX_IF_PACKET_WRAPPER_H > +#define __LINUX_IF_PACKET_WRAPPER_H 1 > + > +#ifdef HAVE_TPACKET_V3 > +#include_next <linux/if_packet.h> > +#else > +#define HAVE_TPACKET_V3 1 > + > +struct sockaddr_pkt { > + unsigned short spkt_family; > + unsigned char spkt_device[14]; > + uint16_t spkt_protocol; > +}; > + > +struct sockaddr_ll { > + unsigned short sll_family; > + uint16_t sll_protocol; > + int sll_ifindex; > + unsigned short sll_hatype; > + unsigned char sll_pkttype; > + unsigned char sll_halen; > + unsigned char sll_addr[8]; > +}; > + > +/* Packet types */ > +#define PACKET_HOST 0 /* To us */ > +#define PACKET_OTHERHOST 3 /* To someone else */ > +#define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */ > + > +/* Packet socket options */ > +#define PACKET_RX_RING 5 > +#define PACKET_VERSION 10 > +#define PACKET_TX_RING 13 > +#define PACKET_VNET_HDR 15 > + > +/* Rx ring - header status */ > +#define TP_STATUS_KERNEL 0 > +#define TP_STATUS_USER (1 << 0) > +#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ > +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ > + > +/* Tx ring - header status */ > +#define TP_STATUS_SEND_REQUEST (1 << 0) > +#define TP_STATUS_SENDING (1 << 1) > + > +struct tpacket_hdr { > + unsigned long tp_status; > + unsigned int tp_len; > + unsigned int tp_snaplen; > + unsigned short tp_mac; > + unsigned short tp_net; > + unsigned int tp_sec; > + unsigned int tp_usec; > +}; > + > +#define TPACKET_ALIGNMENT 16 > +#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) > + > +struct tpacket_hdr_variant1 { > + uint32_t tp_rxhash; > + uint32_t tp_vlan_tci; > + uint16_t tp_vlan_tpid; > + uint16_t tp_padding; > +}; > + > +struct tpacket3_hdr { > + uint32_t tp_next_offset; > + uint32_t tp_sec; > + uint32_t tp_nsec; > + uint32_t tp_snaplen; > + uint32_t tp_len; > + uint32_t tp_status; > + uint16_t tp_mac; > + uint16_t tp_net; > + /* pkt_hdr variants */ > 
+ union { > + struct tpacket_hdr_variant1 hv1; > + }; > + uint8_t tp_padding[8]; > +}; > + > +struct tpacket_bd_ts { > + unsigned int ts_sec; > + union { > + unsigned int ts_usec; > + unsigned int ts_nsec; > + }; > +}; > + > +struct tpacket_hdr_v1 { > + uint32_t block_status; > + uint32_t num_pkts; > + uint32_t offset_to_first_pkt; > + uint32_t blk_len; > + uint64_t __attribute__((aligned(8))) seq_num; > + struct tpacket_bd_ts ts_first_pkt, ts_last_pkt; > +}; > + > +union tpacket_bd_header_u { > + struct tpacket_hdr_v1 bh1; > +}; > + > +struct tpacket_block_desc { > + uint32_t version; > + uint32_t offset_to_priv; > + union tpacket_bd_header_u hdr; > +}; > + > +#define TPACKET3_HDRLEN \ > + (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll)) > + > +enum tpacket_versions { > + TPACKET_V1, > + TPACKET_V2, > + TPACKET_V3 > +}; > + > +struct tpacket_req3 { > + unsigned int tp_block_size; /* Minimal size of contiguous block */ > + unsigned int tp_block_nr; /* Number of blocks */ > + unsigned int tp_frame_size; /* Size of frame */ > + unsigned int tp_frame_nr; /* Total number of frames */ > + unsigned int tp_retire_blk_tov; /* Timeout in msecs */ > + unsigned int tp_sizeof_priv; /* Offset to private data area */ > + unsigned int tp_feature_req_word; > +}; > +#endif /* HAVE_TPACKET_V3 */ > +#endif /* __LINUX_IF_PACKET_WRAPPER_H */ > diff --git a/include/sparse/linux/if_packet.h b/include/sparse/linux/if_packet.h > index 5ff6d47..0ac3fce 100644 > --- a/include/sparse/linux/if_packet.h > +++ b/include/sparse/linux/if_packet.h Similar here. How about just use recvmmsg? > @@ -5,6 +5,7 @@ > #error "Use this header only with sparse. It is not a correct implementation." > #endif > > +#include <openvswitch/types.h> > #include_next <linux/if_packet.h> > > /* Fix endianness of 'spkt_protocol' and 'sll_protocol' members. 
*/ > @@ -27,4 +28,114 @@ struct sockaddr_ll { > unsigned char sll_addr[8]; > }; > > +/* Packet types */ > +#define PACKET_HOST 0 /* To us */ > +#define PACKET_OTHERHOST 3 /* To someone else */ > +#define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */ > + > +/* Packet socket options */ > +#define PACKET_RX_RING 5 > +#define PACKET_VERSION 10 > +#define PACKET_TX_RING 13 > +#define PACKET_VNET_HDR 15 > + > +/* Rx ring - header status */ > +#define TP_STATUS_KERNEL 0 > +#define TP_STATUS_USER (1 << 0) > +#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ > +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ > + > +/* Tx ring - header status */ > +#define TP_STATUS_SEND_REQUEST (1 << 0) > +#define TP_STATUS_SENDING (1 << 1) > + > +#define tpacket_hdr rpl_tpacket_hdr > +struct tpacket_hdr { > + unsigned long tp_status; > + unsigned int tp_len; > + unsigned int tp_snaplen; > + unsigned short tp_mac; > + unsigned short tp_net; > + unsigned int tp_sec; > + unsigned int tp_usec; > +}; > + > +#define TPACKET_ALIGNMENT 16 > +#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) > + > +#define tpacket_hdr_variant1 rpl_tpacket_hdr_variant1 > +struct tpacket_hdr_variant1 { > + uint32_t tp_rxhash; > + uint32_t tp_vlan_tci; > + uint16_t tp_vlan_tpid; > + uint16_t tp_padding; > +}; > + > +#define tpacket3_hdr rpl_tpacket3_hdr > +struct tpacket3_hdr { > + uint32_t tp_next_offset; > + uint32_t tp_sec; > + uint32_t tp_nsec; > + uint32_t tp_snaplen; > + uint32_t tp_len; > + uint32_t tp_status; > + uint16_t tp_mac; > + uint16_t tp_net; > + /* pkt_hdr variants */ > + union { > + struct tpacket_hdr_variant1 hv1; > + }; > + uint8_t tp_padding[8]; > +}; > + > +#define tpacket_bd_ts rpl_tpacket_bd_ts > +struct tpacket_bd_ts { > + unsigned int ts_sec; > + union { > + unsigned int ts_usec; > + unsigned int ts_nsec; > + }; > +}; > + > +#define tpacket_hdr_v1 rpl_tpacket_hdr_v1 > +struct tpacket_hdr_v1 { > + uint32_t 
block_status; > + uint32_t num_pkts; > + uint32_t offset_to_first_pkt; > + uint32_t blk_len; > + uint64_t __attribute__((aligned(8))) seq_num; > + struct tpacket_bd_ts ts_first_pkt, ts_last_pkt; > +}; > + > +#define tpacket_bd_header_u rpl_tpacket_bd_header_u > +union tpacket_bd_header_u { > + struct tpacket_hdr_v1 bh1; > +}; > + > +#define tpacket_block_desc rpl_tpacket_block_desc > +struct tpacket_block_desc { > + uint32_t version; > + uint32_t offset_to_priv; > + union tpacket_bd_header_u hdr; > +}; > + > +#define TPACKET3_HDRLEN \ > + (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll)) > + > +enum rpl_tpacket_versions { > + TPACKET_V1, > + TPACKET_V2, > + TPACKET_V3 > +}; > + > +#define tpacket_req3 rpl_tpacket_req3 > +struct tpacket_req3 { > + unsigned int tp_block_size; /* Minimal size of contiguous block */ > + unsigned int tp_block_nr; /* Number of blocks */ > + unsigned int tp_frame_size; /* Size of frame */ > + unsigned int tp_frame_nr; /* Total number of frames */ > + unsigned int tp_retire_blk_tov; /* Timeout in msecs */ > + unsigned int tp_sizeof_priv; /* Offset to private data area */ > + unsigned int tp_feature_req_word; > +}; > #endif > diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h > index c7c515f..ccd58f4 100644 > --- a/lib/netdev-linux-private.h > +++ b/lib/netdev-linux-private.h > @@ -26,6 +26,7 @@ > #include <linux/mii.h> > #include <stdint.h> > #include <stdbool.h> > +#include <linux/if_packet.h> need to place in order. 
> > #include "dp-packet.h" > #include "netdev-afxdp.h" > @@ -41,6 +42,22 @@ struct netdev; > /* The maximum packet length is 16 bits */ > #define LINUX_RXQ_TSO_MAX_LEN 65535 > > +#ifdef HAVE_TPACKET_V3 > +struct tpacket_ring { > + int sockfd; > + struct iovec *rd; > + uint8_t *mm_space; > + size_t mm_len, rd_len; > + struct sockaddr_ll ll; > + int type, rd_num, flen; > + struct tpacket_req3 req; > + uint32_t block_num; > + uint32_t frame_num; > + uint32_t frame_num_in_block; > + void * ppd; > +}; > +#endif /* HAVE_TPACKET_V3 */ > + > struct netdev_rxq_linux { > struct netdev_rxq up; > bool is_tap; > @@ -105,6 +122,11 @@ struct netdev_linux { > > int numa_id; /* NUMA node id. */ > > +#ifdef HAVE_TPACKET_V3 > + struct tpacket_ring *tp_rx_ring; > + struct tpacket_ring *tp_tx_ring; > +#endif > + > #ifdef HAVE_AF_XDP > /* AF_XDP information. */ > struct xsk_socket_info **xsks; > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c > index c6e46f1..f734086 100644 > --- a/lib/netdev-linux.c > +++ b/lib/netdev-linux.c > @@ -48,6 +48,9 @@ > #include <stdlib.h> > #include <string.h> > #include <unistd.h> > +#ifdef HAVE_TPACKET_V3 > +#include <sys/mman.h> > +#endif > > #include "coverage.h" > #include "dp-packet.h" > @@ -970,6 +973,7 @@ netdev_linux_construct_tap(struct netdev *netdev_) > static const char tap_dev[] = "/dev/net/tun"; > const char *name = netdev_->name; > struct ifreq ifr; > + bool tso = userspace_tso_enabled(); > > int error = netdev_linux_common_construct(netdev_); > if (error) { > @@ -987,7 +991,7 @@ netdev_linux_construct_tap(struct netdev *netdev_) > /* Create tap device. */ > get_flags(&netdev->up, &netdev->ifi_flags); > ifr.ifr_flags = IFF_TAP | IFF_NO_PI; > - if (userspace_tso_enabled()) { > + if (tso) { > ifr.ifr_flags |= IFF_VNET_HDR; > } > > @@ -1012,7 +1016,7 @@ netdev_linux_construct_tap(struct netdev *netdev_) > goto error_close; > } > > - if (userspace_tso_enabled()) { > + if (tso) { > /* Old kernels don't support TUNSETOFFLOAD. 
If TUNSETOFFLOAD is > * available, it will return EINVAL when a flag is unknown. > * Therefore, try enabling offload with no flags to check > @@ -1074,6 +1078,111 @@ netdev_linux_rxq_alloc(void) > return &rx->up; > } > > +#ifdef HAVE_TPACKET_V3 > +static inline struct tpacket3_hdr * > +tpacket_get_next_frame(struct tpacket_ring *ring, uint32_t frame_num) > +{ > + uint8_t *f0 = ring->rd[0].iov_base; > + > + return ALIGNED_CAST(struct tpacket3_hdr *, > + f0 + (frame_num * ring->req.tp_frame_size)); > +} > + > +/* > + * ring->rd_num is tp_block_nr, ring->flen is tp_block_size maybe more explanation for the comments. > + */ > +static inline void > +tpacket_fill_ring(struct tpacket_ring *ring, unsigned int blocks, int type) > +{ > + if (type == PACKET_RX_RING) { > + ring->req.tp_retire_blk_tov = 0; > + ring->req.tp_sizeof_priv = 0; > + ring->req.tp_feature_req_word = 0; > + } > + > + if (userspace_tso_enabled()) { > + /* For TX ring, the whole packet must be in one frame > + * so tp_frame_size must big enough to accommodate > + * 64K packet, tpacket3_hdr will occupy some bytes, > + * the final frame size is 64K + 4K = 68K. 
> + */ > + ring->req.tp_frame_size = (getpagesize() << 4) + getpagesize(); > + ring->req.tp_block_size = ring->req.tp_frame_size; > + } else { > + ring->req.tp_block_size = getpagesize() << 2; > + ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7; > + } > + > + ring->req.tp_block_nr = blocks; > + > + ring->req.tp_frame_nr = ring->req.tp_block_size / > + ring->req.tp_frame_size * > + ring->req.tp_block_nr; > + > + ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr; > + ring->rd_num = ring->req.tp_block_nr; > + ring->flen = ring->req.tp_block_size; > +} > + > +static int > +tpacket_setup_ring(int sock, struct tpacket_ring *ring, int type) > +{ > + int ret = 0; > + unsigned int blocks; > + > + if (userspace_tso_enabled()) { > + blocks = 128; > + } else { > + blocks = 256; > + } > + ring->type = type; > + tpacket_fill_ring(ring, blocks, type); > + ret = setsockopt(sock, SOL_PACKET, type, &ring->req, > + sizeof(ring->req)); > + > + if (ret == -1) { > + return -1; > + } > + > + ring->rd_len = ring->rd_num * sizeof(*ring->rd); > + ring->rd = xmalloc(ring->rd_len); > + if (ring->rd == NULL) { > + return -1; > + } > + > + return 0; > +} > + > +static inline int > +tpacket_mmap_rx_tx_ring(int sock, struct tpacket_ring *rx_ring, > + struct tpacket_ring *tx_ring) > +{ > + int i; > + > + rx_ring->mm_space = mmap(NULL, rx_ring->mm_len + tx_ring->mm_len, > + PROT_READ | PROT_WRITE, > + MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0); > + if (rx_ring->mm_space == MAP_FAILED) { > + return -1; > + } > + > + memset(rx_ring->rd, 0, rx_ring->rd_len); > + for (i = 0; i < rx_ring->rd_num; ++i) { > + rx_ring->rd[i].iov_base = rx_ring->mm_space + (i * rx_ring->flen); > + rx_ring->rd[i].iov_len = rx_ring->flen; > + } > + > + tx_ring->mm_space = rx_ring->mm_space + rx_ring->mm_len; > + memset(tx_ring->rd, 0, tx_ring->rd_len); > + for (i = 0; i < tx_ring->rd_num; ++i) { > + tx_ring->rd[i].iov_base = tx_ring->mm_space + (i * tx_ring->flen); > + tx_ring->rd[i].iov_len = 
tx_ring->flen; > + } > + > + return 0; > +} > +#endif > + > static int > netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > { > @@ -1081,6 +1190,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > struct netdev *netdev_ = rx->up.netdev; > struct netdev_linux *netdev = netdev_linux_cast(netdev_); > int error; > + bool tso = userspace_tso_enabled(); > > ovs_mutex_lock(&netdev->mutex); > rx->is_tap = is_tap_netdev(netdev_); > @@ -1089,6 +1199,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > } else { > struct sockaddr_ll sll; > int ifindex, val; > + > /* Result of tcpdump -dd inbound */ > static const struct sock_filter filt[] = { > { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */ > @@ -1101,7 +1212,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > }; > > /* Create file descriptor. */ > - rx->fd = socket(PF_PACKET, SOCK_RAW, 0); > + rx->fd = socket(PF_PACKET, SOCK_RAW, (OVS_FORCE int) htons(ETH_P_ALL)); > if (rx->fd < 0) { > error = errno; > VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error)); > @@ -1116,7 +1227,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > goto error; > } > > - if (userspace_tso_enabled() > + if (tso > && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val, > sizeof val)) { > error = errno; > @@ -1125,6 +1236,53 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > goto error; > } > > +#ifdef HAVE_TPACKET_V3 > + if (!tso) { > + static int ver = TPACKET_V3; > + > + /* TPACKET_V3 ring setup must be after setsockopt > + * PACKET_VNET_HDR because PACKET_VNET_HDR will return error > + * (EBUSY) if ring is set up > + */ > + error = setsockopt(rx->fd, SOL_PACKET, PACKET_VERSION, &ver, > + sizeof(ver)); > + if (error != 0) { > + error = errno; > + VLOG_ERR("%s: failed to set tpacket version (%s)", > + netdev_get_name(netdev_), ovs_strerror(error)); > + goto error; > + } > + netdev->tp_rx_ring = xzalloc(sizeof(struct tpacket_ring)); > + netdev->tp_tx_ring = xzalloc(sizeof(struct tpacket_ring)); > + 
netdev->tp_rx_ring->sockfd = rx->fd; > + netdev->tp_tx_ring->sockfd = rx->fd; > + error = tpacket_setup_ring(rx->fd, netdev->tp_rx_ring, > + PACKET_RX_RING); > + if (error != 0) { > + error = errno; > + VLOG_ERR("%s: failed to set tpacket rx ring (%s)", > + netdev_get_name(netdev_), ovs_strerror(error)); > + goto error; > + } > + error = tpacket_setup_ring(rx->fd, netdev->tp_tx_ring, > + PACKET_TX_RING); > + if (error != 0) { > + error = errno; > + VLOG_ERR("%s: failed to set tpacket tx ring (%s)", > + netdev_get_name(netdev_), ovs_strerror(error)); > + goto error; > + } > + error = tpacket_mmap_rx_tx_ring(rx->fd, netdev->tp_rx_ring, > + netdev->tp_tx_ring); > + if (error != 0) { > + error = errno; > + VLOG_ERR("%s: failed to mmap tpacket rx & tx ring (%s)", > + netdev_get_name(netdev_), ovs_strerror(error)); > + goto error; > + } > + } > +#endif > + > /* Set non-blocking mode. */ > error = set_nonblocking(rx->fd); > if (error) { > @@ -1139,9 +1297,16 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) > > /* Bind to specific ethernet device. */ > memset(&sll, 0, sizeof sll); > - sll.sll_family = AF_PACKET; > + sll.sll_family = PF_PACKET; What's the difference here? Does using AF_PACKET not work? 
> +#ifdef HAVE_TPACKET_V3 > + if (!tso) { > + sll.sll_hatype = 0; > + sll.sll_pkttype = 0; > + sll.sll_halen = 0; > + } > +#endif > sll.sll_ifindex = ifindex; > - sll.sll_protocol = htons(ETH_P_ALL); > + sll.sll_protocol = (OVS_FORCE ovs_be16) htons(ETH_P_ALL); > if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) { > error = errno; > VLOG_ERR("%s: failed to bind raw socket (%s)", > @@ -1178,6 +1343,19 @@ netdev_linux_rxq_destruct(struct netdev_rxq *rxq_) > int i; > > if (!rx->is_tap) { > +#ifdef HAVE_TPACKET_V3 > + if (!userspace_tso_enabled()) { > + struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev); > + > + if (netdev->tp_rx_ring) { > + munmap(netdev->tp_rx_ring->mm_space, > + 2 * netdev->tp_rx_ring->mm_len); > + free(netdev->tp_rx_ring->rd); > + free(netdev->tp_tx_ring->rd); > + } > + } > +#endif > + > close(rx->fd); > } > > @@ -1220,8 +1398,8 @@ auxdata_has_vlan_tci(const struct tpacket_auxdata *aux) > * It also used recvmmsg to reduce multiple syscalls overhead; > */ > static int > -netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu, > - struct dp_packet_batch *batch) > +netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, bool tso, > + int mtu, struct dp_packet_batch *batch) > { I think this is unrelated changes. We can call userspace_tso_enable() in the function instead of passing extra argument. > int iovlen; > size_t std_len; > @@ -1237,7 +1415,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu, > struct dp_packet *buffers[NETDEV_MAX_BURST]; > int i; > > - if (userspace_tso_enabled()) { > + if (tso) { I think this is unrelated changes. > /* Use the buffer from the allocated packet below to receive MTU > * sized packets and an aux_buf for extra TSO data. */ > iovlen = IOV_TSO_SIZE; > @@ -1368,7 +1546,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu, > * packets are added into *batch. The return value is 0 or errno. 
> */ > static int > -netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, > +netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, bool tso, int mtu, > struct dp_packet_batch *batch) I think this is unrelated changes. We can call userspace_tso_enable() in the function instead of passing as argument. > { > int virtio_net_hdr_size; > @@ -1377,7 +1555,7 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, > int iovlen; > int i; > > - if (userspace_tso_enabled()) { > + if (tso) { > /* Use the buffer from the allocated packet below to receive MTU > * sized packets and an aux_buf for extra TSO data. */ > iovlen = IOV_TSO_SIZE; > @@ -1454,6 +1632,109 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, > return 0; > } > > +#ifdef HAVE_TPACKET_V3 > +static int > +netdev_linux_batch_recv_tpacket(struct netdev_rxq_linux *rx, bool tso, int mtu, > + struct dp_packet_batch *batch) > +{ > + struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up); > + struct netdev_linux *netdev = netdev_linux_cast(netdev_); > + struct dp_packet *buffer; > + int i = 0; > + unsigned int block_num; > + unsigned int fn_in_block; > + struct tpacket_block_desc *pbd; > + struct tpacket3_hdr *ppd; > + int virtio_net_hdr_size; > + size_t buffer_len; > + > + if (tso) { > + virtio_net_hdr_size = sizeof(struct virtio_net_hdr); > + } else { > + virtio_net_hdr_size = 0; > + } > + buffer_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu; > + > + ppd = ALIGNED_CAST(struct tpacket3_hdr *, netdev->tp_rx_ring->ppd); > + block_num = netdev->tp_rx_ring->block_num; > + fn_in_block = netdev->tp_rx_ring->frame_num_in_block; > + pbd = ALIGNED_CAST(struct tpacket_block_desc *, > + netdev->tp_rx_ring->rd[block_num].iov_base); > + > + while (i < NETDEV_MAX_BURST) { > + if ((pbd->hdr.bh1.block_status & TP_STATUS_USER) == 0) { > + break; > + } > + if (fn_in_block == 0) { > + ppd = ALIGNED_CAST(struct tpacket3_hdr *, (uint8_t *) pbd + > + 
pbd->hdr.bh1.offset_to_first_pkt); > + } > + > + if (ppd->tp_snaplen > (mtu + VLAN_ETH_HEADER_LEN)) { > + buffer_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN > + + ppd->tp_snaplen; > + } > + > + buffer = dp_packet_new_with_headroom(buffer_len, DP_NETDEV_HEADROOM); > + memcpy(dp_packet_data(buffer), > + (uint8_t *) ppd + ppd->tp_mac - virtio_net_hdr_size, > + ppd->tp_snaplen + virtio_net_hdr_size); > + dp_packet_set_size(buffer, > + dp_packet_size(buffer) + ppd->tp_snaplen > + + virtio_net_hdr_size); > + > + if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(buffer)) { > + /* Unexpected error situation: the virtio header is not present > + * or corrupted. Drop the packet but continue in case next ones > + * are correct. */ > + dp_packet_delete(buffer); > + netdev->rx_dropped += 1; > + VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header", > + netdev_get_name(netdev_)); > + } else { > + if (ppd->tp_status & TP_STATUS_VLAN_VALID) { > + struct eth_header *eth; > + bool double_tagged; > + ovs_be16 vlan_tpid; > + > + eth = dp_packet_data(buffer); > + double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q); > + if (ppd->tp_status & TP_STATUS_VLAN_TPID_VALID) { > + vlan_tpid = htons(ppd->hv1.tp_vlan_tpid); > + } else if (double_tagged) { > + vlan_tpid = htons(ETH_TYPE_VLAN_8021AD); > + } else { > + vlan_tpid = htons(ETH_TYPE_VLAN_8021Q); > + } > + eth_push_vlan(buffer, vlan_tpid, htons(ppd->hv1.tp_vlan_tci)); > + } > + dp_packet_batch_add(batch, buffer); > + } > + > + fn_in_block++; > + if (fn_in_block >= pbd->hdr.bh1.num_pkts) { > + pbd->hdr.bh1.block_status = TP_STATUS_KERNEL; > + block_num = (block_num + 1) % > + netdev->tp_rx_ring->req.tp_block_nr; > + pbd = (struct tpacket_block_desc *) > + netdev->tp_rx_ring->rd[block_num].iov_base; > + fn_in_block = 0; > + ppd = NULL; > + } else { > + ppd = ALIGNED_CAST(struct tpacket3_hdr *, > + (uint8_t *) ppd + ppd->tp_next_offset); > + } > + i++; > + } > + > + netdev->tp_rx_ring->block_num = block_num; > + 
netdev->tp_rx_ring->frame_num_in_block = fn_in_block; > + netdev->tp_rx_ring->ppd = ppd; > + > + return 0; > +} > +#endif /* HAVE_TPACKET_V3 */ > + > static int > netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, > int *qfill) > @@ -1462,12 +1743,13 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, > struct netdev *netdev = rx->up.netdev; > ssize_t retval; > int mtu; > + bool tso = userspace_tso_enabled(); > > if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) { > mtu = ETH_PAYLOAD_MAX; > } > > - if (userspace_tso_enabled()) { > + if (tso) { > /* Allocate TSO packets. The packet has enough headroom to store > * a full non-TSO packet. When a TSO packet is received, the data > * from non-TSO buffer (std_len) is prepended to the TSO packet > @@ -1485,9 +1767,19 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, > } > > dp_packet_batch_init(batch); > - retval = (rx->is_tap > - ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch) > - : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch)); > + if (rx->is_tap) { > + retval = netdev_linux_batch_rxq_recv_tap(rx, tso, mtu, batch); > + } else { > + if (tso) { > + retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch); > + } else { > +#ifndef HAVE_TPACKET_V3 > + retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch); > +#else > + retval = netdev_linux_batch_recv_tpacket(rx, tso, mtu, batch); > +#endif > + } > + } > > if (retval) { > if (retval != EAGAIN && retval != EMSGSIZE) { > @@ -1692,6 +1984,83 @@ netdev_linux_get_numa_id(const struct netdev *netdev_) > return numa_id; > } > > +#ifdef HAVE_TPACKET_V3 > +static inline int > +tpacket_tx_is_ready(void * next_frame) > +{ > + struct tpacket3_hdr *hdr = ALIGNED_CAST(struct tpacket3_hdr *, next_frame); > + > + return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)); > +} > + > +static int > +netdev_linux_tpacket_batch_send(struct netdev *netdev_, bool tso, 
int mtu, > + struct dp_packet_batch *batch) > +{ > + struct netdev_linux *netdev = netdev_linux_cast(netdev_); > + struct dp_packet *packet; > + int sockfd; > + ssize_t bytes_sent; > + int total_pkts = 0; > + > + unsigned int frame_nr = netdev->tp_tx_ring->req.tp_frame_nr; > + unsigned int frame_num = netdev->tp_tx_ring->frame_num; > + > + /* The Linux tap driver returns EIO if the device is not up, > + * so if the device is not up, don't waste time sending it. > + * However, if the device is in another network namespace > + * then OVS can't retrieve the state. In that case, send the > + * packets anyway. */ > + if (netdev->present && !(netdev->ifi_flags & IFF_UP)) { > + netdev->tx_dropped += dp_packet_batch_size(batch); > + return 0; > + } > + > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + size_t size; > + struct tpacket3_hdr *ppd; > + > + if (tso) { > + netdev_linux_prepend_vnet_hdr(packet, mtu); > + } > + > + size = dp_packet_size(packet); > + ppd = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num); > + > + if (!tpacket_tx_is_ready(ppd)) { > + break; > + } > + ppd->tp_snaplen = size; > + ppd->tp_len = size; > + ppd->tp_next_offset = 0; > + > + memcpy((uint8_t *)ppd + TPACKET3_HDRLEN - sizeof(struct sockaddr_ll), > + dp_packet_data(packet), > + size); > + ppd->tp_status = TP_STATUS_SEND_REQUEST; > + frame_num = (frame_num + 1) % frame_nr; > + total_pkts++; > + } > + netdev->tp_tx_ring->frame_num = frame_num; > + > + /* Kick-off transmits */ > + if (total_pkts != 0) { > + sockfd = netdev->tp_tx_ring->sockfd; > + bytes_sent = sendto(sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0); > + if (bytes_sent == -1 && > + errno != ENOBUFS && errno != EAGAIN) { > + /* > + * In case of an ENOBUFS/EAGAIN error all of the enqueued > + * packets will be considered successful even though only some > + * are sent. > + */ > + netdev->tx_dropped += dp_packet_batch_size(batch); > + } > + } > + return 0; > +} > +#endif > + > /* Sends 'batch' on 'netdev'. 
Returns 0 if successful, otherwise a positive > * errno value. Returns EAGAIN without blocking if the packet cannot be queued > * immediately. Returns EMSGSIZE if a partial packet was transmitted or if > @@ -1731,7 +2100,17 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED, > goto free_batch; > } > > - error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch); > + if (tso) { > + error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, > + batch); > + } else { > +#ifndef HAVE_TPACKET_V3 > + error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, > + batch); > +#else > + error = netdev_linux_tpacket_batch_send(netdev_, tso, mtu, batch); > +#endif > + } > } else { > error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch); > } > -- > 1.8.3.1 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
On Tue, Mar 10, 2020 at 7:42 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote: > > Hi, William > > I'll fix some your concerns in next ver, please check other inline replies. > > -----邮件原件----- > 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 William Tu > 发送时间: 2020年3月11日 3:43 > 收件人: yang_y_yi <yang_y_yi@163.com> > 抄送: ovs-dev <ovs-dev@openvswitch.org> > 主题: Re: [ovs-dev] [PATCH v6] Use TPACKET_V3 to accelerate veth for > userspace datapath > > On Fri, Mar 6, 2020 at 6:35 AM <yang_y_yi@163.com> wrote: > > > > From: Yi Yang <yangyi01@inspur.com> > > > > We can avoid high system call overhead by using TPACKET_V3 and using > > DPDK-like poll to receive and send packets (Note: send still needs to > > call sendto to trigger final packet transmission). > > > > From Linux kernel 3.10 on, TPACKET_V3 has been supported, so all the > > Linux kernels current OVS supports can run > > TPACKET_V3 without any problem. > > > > I can see about 30% performance improvement for veth compared to last > > recvmmsg optimization if I use TPACKET_V3, it is about 1.98 Gbps, but > > it was 1.47 Gbps before. > > On my testbed, I didn't see any performance gain. > For a 100 sec TCP iperf3, I see with/without tpacket show the same 1.70Gbps. > Do you think if we set .is_pmd=true, the performance might be better because > tpacket is ring-based? > > [Yi Yang] Please make sure userspace-tso-enabled is set to false for your > test, if it is true, tpacket_v3 isn't used. > > Please use physical machines, it isn't so noticeable if you use it inside > VMs. Here is my data for your reference ( I used a 5.5.7 kernel, but it is > not relevant to kernel version basically). > > My physical machine is a low end server, so performance improvement isn't so > obvious. But a big improvement is retr value is almost 0. To set is_pmd to > true and use dpdk buffer is my next step to improve performance further. I > also have a tpacket_v3 patch for tap in hand. 
In my previous physical server, > improvement is very obvious. My goal is about 4Gbps, it is 3.9Gbps in my > previous physical server with is_pmd set to true and use dpdk buffer for > dp_packet. With the current patch, is_pmd is always false. How do you set is_pmd to true? > > No tpacket_v3 > ============= > - - - - - - - - - - - - - - - - - - - - - - - - - > [ ID] Interval Transfer Bandwidth Retr > [ 4] 0.00-60.00 sec 7.90 GBytes 1.13 Gbits/sec 39672 sender > [ 4] 0.00-60.00 sec 7.90 GBytes 1.13 Gbits/sec receiver > - - - - - - - - - - - - - - - - - - - - - - - - - > [ ID] Interval Transfer Bandwidth > [ 5] 0.00-60.00 sec 0.00 Bytes 0.00 bits/sec sender > [ 5] 0.00-60.00 sec 7.90 GBytes 1.13 Gbits/sec receiver > <snip> > > iperf Done. > [yangyi@localhost ovs-master]$ uname -a > Linux localhost.localdomain 5.5.7-1.el7.elrepo.x86_64 #1 SMP Fri Feb 28 > 12:21:58 EST 2020 x86_64 x86_64 x86_64 GNU/Linux > tpacket_v3 > ========== <snip> > [ ID] Interval Transfer Bandwidth > [ 5] 0.00-60.02 sec 0.00 Bytes 0.00 bits/sec sender > [ 5] 0.00-60.02 sec 8.39 GBytes 1.20 Gbits/sec receiver > So your current result is no tpacket 1.13G (with some retransmission) with tpacket 1.20G (zero retransmission) This is around 7% improvement. > > > > > > > TPACKET_V3 can support TSO, but its performance isn't good because of > > TPACKET_V3 kernel implementation issue, so it falls back to > > What's the implementation issue? If we use latest kernel, does the issue > still exist? > > [Yi Yang] Per my check, the issue is the kernel can't feed enough packets to > tpacket_recv, so in many cases, no packets received, no 32 packets available, > but for original non-tpacket case, one recv will get 32 packets in most cases, > throughput is about more than twice for veth, for tap case, it is more than > three times, I read kernel source code, but I can't find root cause, I'll > check from tpacket maintainer. 
> > > recvmmsg in case userspace-tso-enable is set to true, but its > > performance is better than recvmmsg in case userspace-tso-enable is > > set to false, so just use TPACKET_V3 in that case. > > > > Signed-off-by: Yi Yang <yangyi01@inspur.com> > > Co-authored-by: William Tu <u9012063@gmail.com> > > Signed-off-by: William Tu <u9012063@gmail.com> > > --- > > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h new > > file mode 100644 index 0000000..e20aacc > > --- /dev/null > > +++ b/include/linux/if_packet.h > > if OVS_CHECK_LINUX_TPACKET returns false, > can we simply fall back to recvmmsg? > So this is not needed? > > [Yi Yang] As you said, ovs support Linux kernel 3.10.0 or above, so no that > case existing, isn't it? I mean if kernel supports it AND if_packet.h header exists, then we enable it. If kernel supports it AND if_packet.h header does not exist, then just use recvmmsg. Thanks William
Thanks William, replies inline. -----邮件原件----- 发件人: William Tu [mailto:u9012063@gmail.com] 发送时间: 2020年3月12日 1:51 收件人: Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> 抄送: yang_y_yi@163.com; ovs-dev@openvswitch.org 主题: Re: [ovs-dev] [PATCH v6] Use TPACKET_V3 to accelerate veth for userspace datapath On Tue, Mar 10, 2020 at 7:42 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote: > > Hi, William > > I'll fix some your concerns in next ver, please check other inline replies. > > -----邮件原件----- > 发件人: dev [mailto:ovs-dev-bounces@openvswitch.org] 代表 William Tu > 发送时间: 2020年3月11日 3:43 > 收件人: yang_y_yi <yang_y_yi@163.com> > 抄送: ovs-dev <ovs-dev@openvswitch.org> > 主题: Re: [ovs-dev] [PATCH v6] Use TPACKET_V3 to accelerate veth for > userspace datapath > > On Fri, Mar 6, 2020 at 6:35 AM <yang_y_yi@163.com> wrote: > > > > From: Yi Yang <yangyi01@inspur.com> > > > > We can avoid high system call overhead by using TPACKET_V3 and using > > DPDK-like poll to receive and send packets (Note: send still needs > > to call sendto to trigger final packet transmission). > > > > From Linux kernel 3.10 on, TPACKET_V3 has been supported, so all the > > Linux kernels current OVS supports can run > > TPACKET_V3 without any problem. > > > > I can see about 30% performance improvement for veth compared to > > last recvmmsg optimization if I use TPACKET_V3, it is about 1.98 > > Gbps, but it was 1.47 Gbps before. > > On my testbed, I didn't see any performance gain. > For a 100 sec TCP iperf3, I see with/without tpacket show the same 1.70Gbps. > Do you think if we set .is_pmd=true, the performance might be better > because tpacket is ring-based? > > [Yi Yang] Please make sure userspace-tso-enabled is set to false for > your test, if it is true, tpacket_v3 isn't used. > > Please use physical machines, it isn't so noticeable if you use it > inside VMs. Here is my data for your reference ( I used a 5.5.7 > kernel, but it is not relevant to kernel version basically). 
> > My physical machine is a low end server, so performance improvement > isn't so obvious. But a big improvement is retr value is almost 0. To > set is_pmd to true and use dpdk buffer is my next step to improve > performance further. I also have a tpacket_v3 patch for tap in hand. > In my previous physical server, improvement is very obvious. My goal > is about 4Gbps, it is 3.9Gbps in my previous physical server with > is_pmd set to true and use dpdk buffer for dp_packet. With the current patch, is_pmd is always false. How do you set is_pmd to true? [Yi Yang] I have patches in my hand to do this, my goal is to use pmd thread to handle such case, it is more scalable than ovs-vswitchd, currently, only one ovs-vswitchd is handling all the such interfaces, I don't think it is an efficient way for the use cases which pursue performance. > > No tpacket_v3 > ============= > - - - - - - - - - - - - - - - - - - - - - - - - - > [ ID] Interval Transfer Bandwidth Retr > [ 4] 0.00-60.00 sec 7.90 GBytes 1.13 Gbits/sec 39672 sender > [ 4] 0.00-60.00 sec 7.90 GBytes 1.13 Gbits/sec receiver > - - - - - - - - - - - - - - - - - - - - - - - - - > [ ID] Interval Transfer Bandwidth > [ 5] 0.00-60.00 sec 0.00 Bytes 0.00 bits/sec sender > [ 5] 0.00-60.00 sec 7.90 GBytes 1.13 Gbits/sec receiver > <snip> > > iperf Done. > [yangyi@localhost ovs-master]$ uname -a Linux localhost.localdomain > 5.5.7-1.el7.elrepo.x86_64 #1 SMP Fri Feb 28 > 12:21:58 EST 2020 x86_64 x86_64 x86_64 GNU/Linux > tpacket_v3 > ========== <snip> > [ ID] Interval Transfer Bandwidth > [ 5] 0.00-60.02 sec 0.00 Bytes 0.00 bits/sec sender > [ 5] 0.00-60.02 sec 8.39 GBytes 1.20 Gbits/sec receiver > So your current result is no tpacket 1.13G (with some retransmission) with tpacket 1.20G (zero retransmission) This is around 7% improvement. [Yi Yang] It is so from this test result, but on my high-end server, I did see higher improvement, but I can't use it now, will recheck this once it is available. 
> > > > > > > TPACKET_V3 can support TSO, but its performance isn't good because > > of > > TPACKET_V3 kernel implementation issue, so it falls back to > > What's the implementation issue? If we use latest kernel, does the > issue still exist? > > [Yi Yang] Per my check, the issue is the kernel can't feed enough > packets to tpacket_recv, so in many cases, no packets received, no 32 > packets available, but for original non-tpacket case, one recv will > get 32 packets in most cases, throughput is about more than twice for > veth, for tap case, it is more than three times, I read kernel source > code, but I can't find root cause, I'll check from tpacket maintainer. > > > recvmmsg in case userspace-tso-enable is set to true, but its > > performance is better than recvmmsg in case userspace-tso-enable is > > set to false, so just use TPACKET_V3 in that case. > > > > Signed-off-by: Yi Yang <yangyi01@inspur.com> > > Co-authored-by: William Tu <u9012063@gmail.com> > > Signed-off-by: William Tu <u9012063@gmail.com> > > --- > > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h > > new file mode 100644 index 0000000..e20aacc > > --- /dev/null > > +++ b/include/linux/if_packet.h > > if OVS_CHECK_LINUX_TPACKET returns false, can we simply fall back to > recvmmsg? > So this is not needed? > > [Yi Yang] As you said, ovs support Linux kernel 3.10.0 or above, so no > that case existing, isn't it? I mean if kernel supports it AND if_packet.h header exists, then we enable it. If kernel supports it AND if_packet.h header does not exist, then just use recvmmsg. [Yi Yang] I'm confused here, Ben told me it should be built even if if_packet.h isn't there, that is why I added if_packet.h in include/linux/if_packet.h, I mean tpacket_v3 code should be built in this case. Thanks William
On Wed, Mar 11, 2020 at 6:14 PM Yi Yang (杨燚)-云服务集团 <yangyi01@inspur.com> wrote: > > > > > > > TPACKET_V3 can support TSO, but its performance isn't good because > > > of > > > TPACKET_V3 kernel implementation issue, so it falls back to > > > > What's the implementation issue? If we use latest kernel, does the > > issue still exist? > > > > [Yi Yang] Per my check, the issue is the kernel can't feed enough > > packets to tpacket_recv, so in many cases, no packets received, no 32 > > packets available, but for original non-tpacket case, one recv will > > get 32 packets in most cases, throughput is about more than twice for > > veth, for tap case, it is more than three times, I read kernel source > > code, but I can't find root cause, I'll check from tpacket maintainer. > > > > > recvmmsg in case userspace-tso-enable is set to true, but its > > > performance is better than recvmmsg in case userspace-tso-enable is > > > set to false, so just use TPACKET_V3 in that case. > > > > > > Signed-off-by: Yi Yang <yangyi01@inspur.com> > > > Co-authored-by: William Tu <u9012063@gmail.com> > > > Signed-off-by: William Tu <u9012063@gmail.com> > > > --- > > > diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h > > > new file mode 100644 index 0000000..e20aacc > > > --- /dev/null > > > +++ b/include/linux/if_packet.h > > > > if OVS_CHECK_LINUX_TPACKET returns false, can we simply fall back to > > recvmmsg? > > So this is not needed? > > > > [Yi Yang] As you said, ovs support Linux kernel 3.10.0 or above, so no > > that case existing, isn't it? > > I mean if kernel supports it AND if_packet.h header exists, then we enable it. > If kernel supports it AND if_packet.h header does not exist, then just use recvmmsg. > > [Yi Yang] I'm confused here, Ben told me it should be built even if if_packet.h isn't there, that is why I added if_packet.h in include/linux/if_packet.h, I mean tpacket_v3 code should be built in this case.
> My concern is that since there is not a lot of performance improvement, we don't necessarily need to use tpacket_v3. Or we should use tpacket_v3 as an optional configuration, but not default. I removed if_packet.h in the following diff, and travis works ok. https://travis-ci.org/github/williamtu/ovs-travis/builds/661631098 --- diff --git a/acinclude.m4 b/acinclude.m4 index 1488deda0371..4b11085ab190 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -1086,12 +1086,14 @@ dnl OVS_CHECK_LINUX_TPACKET dnl dnl Configure Linux TPACKET. AC_DEFUN([OVS_CHECK_LINUX_TPACKET], [ - AC_COMPILE_IFELSE([ - AC_LANG_PROGRAM([#include <linux/if_packet.h>], [ - struct tpacket3_hdr x = { 0 }; - ])], - [AC_DEFINE([HAVE_TPACKET_V3], [1], - [Define to 1 if struct tpacket3_hdr is available.])]) + AC_CHECK_HEADER([linux/if_packet.h], + [AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#include <linux/if_packet.h>], [ + struct tpacket3_hdr x = { 0 }; + ])], + [AC_DEFINE([HAVE_TPACKET_V3], [1], + [Define to 1 if struct tpacket3_hdr is available.])])], + []) ]) dnl Checks for buggy strtok_r.
diff --git a/include/linux/automake.mk b/include/linux/automake.mk index a659e65abe27..8f063f482e15 100644 --- a/include/linux/automake.mk +++ b/include/linux/automake.mk @@ -1,5 +1,4 @@ noinst_HEADERS += \ - include/linux/if_packet.h \ include/linux/netlink.h \ include/linux/netfilter/nf_conntrack_sctp.h \ include/linux/pkt_cls.h \ diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h deleted file mode 100644 index e20aaccb1e32..000000000000 --- a/include/linux/if_packet.h +++ /dev/null @@ -1,128 +0,0 @@ -#ifndef __LINUX_IF_PACKET_WRAPPER_H -#define __LINUX_IF_PACKET_WRAPPER_H 1 - -#ifdef HAVE_TPACKET_V3 -#include_next <linux/if_packet.h> -#else -#define HAVE_TPACKET_V3 1 - -struct sockaddr_pkt { - unsigned short spkt_family; - unsigned char spkt_device[14]; - uint16_t spkt_protocol; -}; - -struct sockaddr_ll { - unsigned short sll_family; - uint16_t sll_protocol; - int sll_ifindex; - unsigned short sll_hatype; - unsigned char sll_pkttype; - unsigned char sll_halen; - unsigned char sll_addr[8]; -}; - -/* Packet types */ -#define PACKET_HOST 0 /* To us */ -#define PACKET_OTHERHOST 3 /* To someone else */ -#define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */ - -/* Packet socket options */ -#define PACKET_RX_RING 5 -#define PACKET_VERSION 10 -#define PACKET_TX_RING 13 -#define PACKET_VNET_HDR 15 - -/* Rx ring - header status */ -#define TP_STATUS_KERNEL 0 -#define TP_STATUS_USER (1 << 0) -#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ -#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ - -/* Tx ring - header status */ -#define TP_STATUS_SEND_REQUEST (1 << 0) -#define TP_STATUS_SENDING (1 << 1) - -struct tpacket_hdr { - unsigned long tp_status; - unsigned int tp_len; - unsigned int tp_snaplen; - unsigned short tp_mac; - unsigned short tp_net; - unsigned int tp_sec; - unsigned int tp_usec; -}; - -#define TPACKET_ALIGNMENT 16 -#define TPACKET_ALIGN(x) 
(((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) - -struct tpacket_hdr_variant1 { - uint32_t tp_rxhash; - uint32_t tp_vlan_tci; - uint16_t tp_vlan_tpid; - uint16_t tp_padding; -}; - -struct tpacket3_hdr { - uint32_t tp_next_offset; - uint32_t tp_sec; - uint32_t tp_nsec; - uint32_t tp_snaplen; - uint32_t tp_len; - uint32_t tp_status; - uint16_t tp_mac; - uint16_t tp_net; - /* pkt_hdr variants */ - union { - struct tpacket_hdr_variant1 hv1; - }; - uint8_t tp_padding[8]; -}; - -struct tpacket_bd_ts { - unsigned int ts_sec; - union { - unsigned int ts_usec; - unsigned int ts_nsec; - }; -}; - -struct tpacket_hdr_v1 { - uint32_t block_status; - uint32_t num_pkts; - uint32_t offset_to_first_pkt; - uint32_t blk_len; - uint64_t __attribute__((aligned(8))) seq_num; - struct tpacket_bd_ts ts_first_pkt, ts_last_pkt; -}; - -union tpacket_bd_header_u { - struct tpacket_hdr_v1 bh1; -}; - -struct tpacket_block_desc { - uint32_t version; - uint32_t offset_to_priv; - union tpacket_bd_header_u hdr; -}; - -#define TPACKET3_HDRLEN \ - (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll)) - -enum tpacket_versions { - TPACKET_V1, - TPACKET_V2, - TPACKET_V3 -}; - -struct tpacket_req3 { - unsigned int tp_block_size; /* Minimal size of contiguous block */ - unsigned int tp_block_nr; /* Number of blocks */ - unsigned int tp_frame_size; /* Size of frame */ - unsigned int tp_frame_nr; /* Total number of frames */ - unsigned int tp_retire_blk_tov; /* Timeout in msecs */ - unsigned int tp_sizeof_priv; /* Offset to private data area */ - unsigned int tp_feature_req_word; -}; -#endif /* HAVE_TPACKET_V3 */ -#endif /* __LINUX_IF_PACKET_WRAPPER_H */ diff --git a/include/sparse/linux/if_packet.h b/include/sparse/linux/if_packet.h index 0ac3fcefc895..3813892a0788 100644 --- a/include/sparse/linux/if_packet.h +++ b/include/sparse/linux/if_packet.h @@ -28,114 +28,4 @@ struct sockaddr_ll { unsigned char sll_addr[8]; }; -/* Packet types */ -#define PACKET_HOST 0 /* To us */ -#define 
PACKET_OTHERHOST 3 /* To someone else */ -#define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */ - -/* Packet socket options */ -#define PACKET_RX_RING 5 -#define PACKET_VERSION 10 -#define PACKET_TX_RING 13 -#define PACKET_VNET_HDR 15 - -/* Rx ring - header status */ -#define TP_STATUS_KERNEL 0 -#define TP_STATUS_USER (1 << 0) -#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ -#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ - -/* Tx ring - header status */ -#define TP_STATUS_SEND_REQUEST (1 << 0) -#define TP_STATUS_SENDING (1 << 1) - -#define tpacket_hdr rpl_tpacket_hdr -struct tpacket_hdr { - unsigned long tp_status; - unsigned int tp_len; - unsigned int tp_snaplen; - unsigned short tp_mac; - unsigned short tp_net; - unsigned int tp_sec; - unsigned int tp_usec; -}; - -#define TPACKET_ALIGNMENT 16 -#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) - -#define tpacket_hdr_variant1 rpl_tpacket_hdr_variant1 -struct tpacket_hdr_variant1 { - uint32_t tp_rxhash; - uint32_t tp_vlan_tci; - uint16_t tp_vlan_tpid; - uint16_t tp_padding; -}; - -#define tpacket3_hdr rpl_tpacket3_hdr -struct tpacket3_hdr { - uint32_t tp_next_offset; - uint32_t tp_sec; - uint32_t tp_nsec; - uint32_t tp_snaplen; - uint32_t tp_len; - uint32_t tp_status; - uint16_t tp_mac; - uint16_t tp_net; - /* pkt_hdr variants */ - union { - struct tpacket_hdr_variant1 hv1; - }; - uint8_t tp_padding[8]; -}; - -#define tpacket_bd_ts rpl_tpacket_bd_ts -struct tpacket_bd_ts { - unsigned int ts_sec; - union { - unsigned int ts_usec; - unsigned int ts_nsec; - }; -}; - -#define tpacket_hdr_v1 rpl_tpacket_hdr_v1 -struct tpacket_hdr_v1 { - uint32_t block_status; - uint32_t num_pkts; - uint32_t offset_to_first_pkt; - uint32_t blk_len; - uint64_t __attribute__((aligned(8))) seq_num; - struct tpacket_bd_ts ts_first_pkt, ts_last_pkt; -}; - -#define tpacket_bd_header_u rpl_tpacket_bd_header_u -union tpacket_bd_header_u { - struct 
tpacket_hdr_v1 bh1; -}; - -#define tpacket_block_desc rpl_tpacket_block_desc -struct tpacket_block_desc { - uint32_t version; - uint32_t offset_to_priv; - union tpacket_bd_header_u hdr; -}; - -#define TPACKET3_HDRLEN \ - (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll)) - -enum rpl_tpacket_versions { - TPACKET_V1, - TPACKET_V2, - TPACKET_V3 -}; - -#define tpacket_req3 rpl_tpacket_req3 -struct tpacket_req3 { - unsigned int tp_block_size; /* Minimal size of contiguous block */ - unsigned int tp_block_nr; /* Number of blocks */ - unsigned int tp_frame_size; /* Size of frame */ - unsigned int tp_frame_nr; /* Total number of frames */ - unsigned int tp_retire_blk_tov; /* Timeout in msecs */ - unsigned int tp_sizeof_priv; /* Offset to private data area */ - unsigned int tp_feature_req_word; -}; #endif
diff --git a/acinclude.m4 b/acinclude.m4 index 1212a46..b39bbb9 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -1093,6 +1093,18 @@ AC_DEFUN([OVS_CHECK_IF_DL], AC_SEARCH_LIBS([pcap_open_live], [pcap]) fi]) +dnl OVS_CHECK_LINUX_TPACKET +dnl +dnl Configure Linux TPACKET. +AC_DEFUN([OVS_CHECK_LINUX_TPACKET], [ + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([#include <linux/if_packet.h>], [ + struct tpacket3_hdr x = { 0 }; + ])], + [AC_DEFINE([HAVE_TPACKET_V3], [1], + [Define to 1 if struct tpacket3_hdr is available.])]) +]) + dnl Checks for buggy strtok_r. dnl dnl Some versions of glibc 2.7 has a bug in strtok_r when compiling diff --git a/configure.ac b/configure.ac index 1877aae..b61a1f4 100644 --- a/configure.ac +++ b/configure.ac @@ -89,6 +89,7 @@ OVS_CHECK_VISUAL_STUDIO_DDK OVS_CHECK_COVERAGE OVS_CHECK_NDEBUG OVS_CHECK_NETLINK +OVS_CHECK_LINUX_TPACKET OVS_CHECK_OPENSSL OVS_CHECK_LIBCAPNG OVS_CHECK_LOGDIR diff --git a/include/linux/automake.mk b/include/linux/automake.mk index 8f063f4..a659e65 100644 --- a/include/linux/automake.mk +++ b/include/linux/automake.mk @@ -1,4 +1,5 @@ noinst_HEADERS += \ + include/linux/if_packet.h \ include/linux/netlink.h \ include/linux/netfilter/nf_conntrack_sctp.h \ include/linux/pkt_cls.h \ diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h new file mode 100644 index 0000000..e20aacc --- /dev/null +++ b/include/linux/if_packet.h @@ -0,0 +1,128 @@ +#ifndef __LINUX_IF_PACKET_WRAPPER_H +#define __LINUX_IF_PACKET_WRAPPER_H 1 + +#ifdef HAVE_TPACKET_V3 +#include_next <linux/if_packet.h> +#else +#define HAVE_TPACKET_V3 1 + +struct sockaddr_pkt { + unsigned short spkt_family; + unsigned char spkt_device[14]; + uint16_t spkt_protocol; +}; + +struct sockaddr_ll { + unsigned short sll_family; + uint16_t sll_protocol; + int sll_ifindex; + unsigned short sll_hatype; + unsigned char sll_pkttype; + unsigned char sll_halen; + unsigned char sll_addr[8]; +}; + +/* Packet types */ +#define PACKET_HOST 0 /* To us */ +#define PACKET_OTHERHOST 
3 /* To someone else */ +#define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */ + +/* Packet socket options */ +#define PACKET_RX_RING 5 +#define PACKET_VERSION 10 +#define PACKET_TX_RING 13 +#define PACKET_VNET_HDR 15 + +/* Rx ring - header status */ +#define TP_STATUS_KERNEL 0 +#define TP_STATUS_USER (1 << 0) +#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ + +/* Tx ring - header status */ +#define TP_STATUS_SEND_REQUEST (1 << 0) +#define TP_STATUS_SENDING (1 << 1) + +struct tpacket_hdr { + unsigned long tp_status; + unsigned int tp_len; + unsigned int tp_snaplen; + unsigned short tp_mac; + unsigned short tp_net; + unsigned int tp_sec; + unsigned int tp_usec; +}; + +#define TPACKET_ALIGNMENT 16 +#define TPACKET_ALIGN(x) (((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) + +struct tpacket_hdr_variant1 { + uint32_t tp_rxhash; + uint32_t tp_vlan_tci; + uint16_t tp_vlan_tpid; + uint16_t tp_padding; +}; + +struct tpacket3_hdr { + uint32_t tp_next_offset; + uint32_t tp_sec; + uint32_t tp_nsec; + uint32_t tp_snaplen; + uint32_t tp_len; + uint32_t tp_status; + uint16_t tp_mac; + uint16_t tp_net; + /* pkt_hdr variants */ + union { + struct tpacket_hdr_variant1 hv1; + }; + uint8_t tp_padding[8]; +}; + +struct tpacket_bd_ts { + unsigned int ts_sec; + union { + unsigned int ts_usec; + unsigned int ts_nsec; + }; +}; + +struct tpacket_hdr_v1 { + uint32_t block_status; + uint32_t num_pkts; + uint32_t offset_to_first_pkt; + uint32_t blk_len; + uint64_t __attribute__((aligned(8))) seq_num; + struct tpacket_bd_ts ts_first_pkt, ts_last_pkt; +}; + +union tpacket_bd_header_u { + struct tpacket_hdr_v1 bh1; +}; + +struct tpacket_block_desc { + uint32_t version; + uint32_t offset_to_priv; + union tpacket_bd_header_u hdr; +}; + +#define TPACKET3_HDRLEN \ + (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll)) + +enum tpacket_versions { + TPACKET_V1, + 
TPACKET_V2, + TPACKET_V3 +}; + +struct tpacket_req3 { + unsigned int tp_block_size; /* Minimal size of contiguous block */ + unsigned int tp_block_nr; /* Number of blocks */ + unsigned int tp_frame_size; /* Size of frame */ + unsigned int tp_frame_nr; /* Total number of frames */ + unsigned int tp_retire_blk_tov; /* Timeout in msecs */ + unsigned int tp_sizeof_priv; /* Offset to private data area */ + unsigned int tp_feature_req_word; +}; +#endif /* HAVE_TPACKET_V3 */ +#endif /* __LINUX_IF_PACKET_WRAPPER_H */ diff --git a/include/sparse/linux/if_packet.h b/include/sparse/linux/if_packet.h index 5ff6d47..0ac3fce 100644 --- a/include/sparse/linux/if_packet.h +++ b/include/sparse/linux/if_packet.h @@ -5,6 +5,7 @@ #error "Use this header only with sparse. It is not a correct implementation." #endif +#include <openvswitch/types.h> #include_next <linux/if_packet.h> /* Fix endianness of 'spkt_protocol' and 'sll_protocol' members. */ @@ -27,4 +28,114 @@ struct sockaddr_ll { unsigned char sll_addr[8]; }; +/* Packet types */ +#define PACKET_HOST 0 /* To us */ +#define PACKET_OTHERHOST 3 /* To someone else */ +#define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */ + +/* Packet socket options */ +#define PACKET_RX_RING 5 +#define PACKET_VERSION 10 +#define PACKET_TX_RING 13 +#define PACKET_VNET_HDR 15 + +/* Rx ring - header status */ +#define TP_STATUS_KERNEL 0 +#define TP_STATUS_USER (1 << 0) +#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */ +#define TP_STATUS_VLAN_TPID_VALID (1 << 6) /* auxdata has valid tp_vlan_tpid */ + +/* Tx ring - header status */ +#define TP_STATUS_SEND_REQUEST (1 << 0) +#define TP_STATUS_SENDING (1 << 1) + +#define tpacket_hdr rpl_tpacket_hdr +struct tpacket_hdr { + unsigned long tp_status; + unsigned int tp_len; + unsigned int tp_snaplen; + unsigned short tp_mac; + unsigned short tp_net; + unsigned int tp_sec; + unsigned int tp_usec; +}; + +#define TPACKET_ALIGNMENT 16 +#define TPACKET_ALIGN(x) 
(((x)+TPACKET_ALIGNMENT-1)&~(TPACKET_ALIGNMENT-1)) + +#define tpacket_hdr_variant1 rpl_tpacket_hdr_variant1 +struct tpacket_hdr_variant1 { + uint32_t tp_rxhash; + uint32_t tp_vlan_tci; + uint16_t tp_vlan_tpid; + uint16_t tp_padding; +}; + +#define tpacket3_hdr rpl_tpacket3_hdr +struct tpacket3_hdr { + uint32_t tp_next_offset; + uint32_t tp_sec; + uint32_t tp_nsec; + uint32_t tp_snaplen; + uint32_t tp_len; + uint32_t tp_status; + uint16_t tp_mac; + uint16_t tp_net; + /* pkt_hdr variants */ + union { + struct tpacket_hdr_variant1 hv1; + }; + uint8_t tp_padding[8]; +}; + +#define tpacket_bd_ts rpl_tpacket_bd_ts +struct tpacket_bd_ts { + unsigned int ts_sec; + union { + unsigned int ts_usec; + unsigned int ts_nsec; + }; +}; + +#define tpacket_hdr_v1 rpl_tpacket_hdr_v1 +struct tpacket_hdr_v1 { + uint32_t block_status; + uint32_t num_pkts; + uint32_t offset_to_first_pkt; + uint32_t blk_len; + uint64_t __attribute__((aligned(8))) seq_num; + struct tpacket_bd_ts ts_first_pkt, ts_last_pkt; +}; + +#define tpacket_bd_header_u rpl_tpacket_bd_header_u +union tpacket_bd_header_u { + struct tpacket_hdr_v1 bh1; +}; + +#define tpacket_block_desc rpl_tpacket_block_desc +struct tpacket_block_desc { + uint32_t version; + uint32_t offset_to_priv; + union tpacket_bd_header_u hdr; +}; + +#define TPACKET3_HDRLEN \ + (TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll)) + +enum rpl_tpacket_versions { + TPACKET_V1, + TPACKET_V2, + TPACKET_V3 +}; + +#define tpacket_req3 rpl_tpacket_req3 +struct tpacket_req3 { + unsigned int tp_block_size; /* Minimal size of contiguous block */ + unsigned int tp_block_nr; /* Number of blocks */ + unsigned int tp_frame_size; /* Size of frame */ + unsigned int tp_frame_nr; /* Total number of frames */ + unsigned int tp_retire_blk_tov; /* Timeout in msecs */ + unsigned int tp_sizeof_priv; /* Offset to private data area */ + unsigned int tp_feature_req_word; +}; #endif diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h index 
c7c515f..ccd58f4 100644 --- a/lib/netdev-linux-private.h +++ b/lib/netdev-linux-private.h @@ -26,6 +26,7 @@ #include <linux/mii.h> #include <stdint.h> #include <stdbool.h> +#include <linux/if_packet.h> #include "dp-packet.h" #include "netdev-afxdp.h" @@ -41,6 +42,22 @@ struct netdev; /* The maximum packet length is 16 bits */ #define LINUX_RXQ_TSO_MAX_LEN 65535 +#ifdef HAVE_TPACKET_V3 +struct tpacket_ring { + int sockfd; + struct iovec *rd; + uint8_t *mm_space; + size_t mm_len, rd_len; + struct sockaddr_ll ll; + int type, rd_num, flen; + struct tpacket_req3 req; + uint32_t block_num; + uint32_t frame_num; + uint32_t frame_num_in_block; + void * ppd; +}; +#endif /* HAVE_TPACKET_V3 */ + struct netdev_rxq_linux { struct netdev_rxq up; bool is_tap; @@ -105,6 +122,11 @@ struct netdev_linux { int numa_id; /* NUMA node id. */ +#ifdef HAVE_TPACKET_V3 + struct tpacket_ring *tp_rx_ring; + struct tpacket_ring *tp_tx_ring; +#endif + #ifdef HAVE_AF_XDP /* AF_XDP information. */ struct xsk_socket_info **xsks; diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c index c6e46f1..f734086 100644 --- a/lib/netdev-linux.c +++ b/lib/netdev-linux.c @@ -48,6 +48,9 @@ #include <stdlib.h> #include <string.h> #include <unistd.h> +#ifdef HAVE_TPACKET_V3 +#include <sys/mman.h> +#endif #include "coverage.h" #include "dp-packet.h" @@ -970,6 +973,7 @@ netdev_linux_construct_tap(struct netdev *netdev_) static const char tap_dev[] = "/dev/net/tun"; const char *name = netdev_->name; struct ifreq ifr; + bool tso = userspace_tso_enabled(); int error = netdev_linux_common_construct(netdev_); if (error) { @@ -987,7 +991,7 @@ netdev_linux_construct_tap(struct netdev *netdev_) /* Create tap device. 
*/ get_flags(&netdev->up, &netdev->ifi_flags); ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - if (userspace_tso_enabled()) { + if (tso) { ifr.ifr_flags |= IFF_VNET_HDR; } @@ -1012,7 +1016,7 @@ netdev_linux_construct_tap(struct netdev *netdev_) goto error_close; } - if (userspace_tso_enabled()) { + if (tso) { /* Old kernels don't support TUNSETOFFLOAD. If TUNSETOFFLOAD is * available, it will return EINVAL when a flag is unknown. * Therefore, try enabling offload with no flags to check @@ -1074,6 +1078,111 @@ netdev_linux_rxq_alloc(void) return &rx->up; } +#ifdef HAVE_TPACKET_V3 +static inline struct tpacket3_hdr * +tpacket_get_next_frame(struct tpacket_ring *ring, uint32_t frame_num) +{ + uint8_t *f0 = ring->rd[0].iov_base; + + return ALIGNED_CAST(struct tpacket3_hdr *, + f0 + (frame_num * ring->req.tp_frame_size)); +} + +/* + * ring->rd_num is tp_block_nr, ring->flen is tp_block_size + */ +static inline void +tpacket_fill_ring(struct tpacket_ring *ring, unsigned int blocks, int type) +{ + if (type == PACKET_RX_RING) { + ring->req.tp_retire_blk_tov = 0; + ring->req.tp_sizeof_priv = 0; + ring->req.tp_feature_req_word = 0; + } + + if (userspace_tso_enabled()) { + /* For TX ring, the whole packet must be in one frame + * so tp_frame_size must be big enough to accommodate + * a 64K packet, tpacket3_hdr will occupy some bytes, + * the final frame size is 64K + 4K = 68K. 
+ */ + ring->req.tp_frame_size = (getpagesize() << 4) + getpagesize(); + ring->req.tp_block_size = ring->req.tp_frame_size; + } else { + ring->req.tp_block_size = getpagesize() << 2; + ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7; + } + + ring->req.tp_block_nr = blocks; + + ring->req.tp_frame_nr = ring->req.tp_block_size / + ring->req.tp_frame_size * + ring->req.tp_block_nr; + + ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr; + ring->rd_num = ring->req.tp_block_nr; + ring->flen = ring->req.tp_block_size; +} + +static int +tpacket_setup_ring(int sock, struct tpacket_ring *ring, int type) +{ + int ret = 0; + unsigned int blocks; + + if (userspace_tso_enabled()) { + blocks = 128; + } else { + blocks = 256; + } + ring->type = type; + tpacket_fill_ring(ring, blocks, type); + ret = setsockopt(sock, SOL_PACKET, type, &ring->req, + sizeof(ring->req)); + + if (ret == -1) { + return -1; + } + + ring->rd_len = ring->rd_num * sizeof(*ring->rd); + ring->rd = xmalloc(ring->rd_len); + if (ring->rd == NULL) { + return -1; + } + + return 0; +} + +static inline int +tpacket_mmap_rx_tx_ring(int sock, struct tpacket_ring *rx_ring, + struct tpacket_ring *tx_ring) +{ + int i; + + rx_ring->mm_space = mmap(NULL, rx_ring->mm_len + tx_ring->mm_len, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0); + if (rx_ring->mm_space == MAP_FAILED) { + return -1; + } + + memset(rx_ring->rd, 0, rx_ring->rd_len); + for (i = 0; i < rx_ring->rd_num; ++i) { + rx_ring->rd[i].iov_base = rx_ring->mm_space + (i * rx_ring->flen); + rx_ring->rd[i].iov_len = rx_ring->flen; + } + + tx_ring->mm_space = rx_ring->mm_space + rx_ring->mm_len; + memset(tx_ring->rd, 0, tx_ring->rd_len); + for (i = 0; i < tx_ring->rd_num; ++i) { + tx_ring->rd[i].iov_base = tx_ring->mm_space + (i * tx_ring->flen); + tx_ring->rd[i].iov_len = tx_ring->flen; + } + + return 0; +} +#endif + static int netdev_linux_rxq_construct(struct netdev_rxq *rxq_) { @@ -1081,6 +1190,7 @@ 
netdev_linux_rxq_construct(struct netdev_rxq *rxq_) struct netdev *netdev_ = rx->up.netdev; struct netdev_linux *netdev = netdev_linux_cast(netdev_); int error; + bool tso = userspace_tso_enabled(); ovs_mutex_lock(&netdev->mutex); rx->is_tap = is_tap_netdev(netdev_); @@ -1089,6 +1199,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) } else { struct sockaddr_ll sll; int ifindex, val; + /* Result of tcpdump -dd inbound */ static const struct sock_filter filt[] = { { 0x28, 0, 0, 0xfffff004 }, /* ldh [0] */ @@ -1101,7 +1212,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) }; /* Create file descriptor. */ - rx->fd = socket(PF_PACKET, SOCK_RAW, 0); + rx->fd = socket(PF_PACKET, SOCK_RAW, (OVS_FORCE int) htons(ETH_P_ALL)); if (rx->fd < 0) { error = errno; VLOG_ERR("failed to create raw socket (%s)", ovs_strerror(error)); @@ -1116,7 +1227,7 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) goto error; } - if (userspace_tso_enabled() + if (tso && setsockopt(rx->fd, SOL_PACKET, PACKET_VNET_HDR, &val, sizeof val)) { error = errno; @@ -1125,6 +1236,53 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) goto error; } +#ifdef HAVE_TPACKET_V3 + if (!tso) { + static int ver = TPACKET_V3; + + /* TPACKET_V3 ring setup must be after setsockopt + * PACKET_VNET_HDR because PACKET_VNET_HDR will return error + * (EBUSY) if ring is set up + */ + error = setsockopt(rx->fd, SOL_PACKET, PACKET_VERSION, &ver, + sizeof(ver)); + if (error != 0) { + error = errno; + VLOG_ERR("%s: failed to set tpacket version (%s)", + netdev_get_name(netdev_), ovs_strerror(error)); + goto error; + } + netdev->tp_rx_ring = xzalloc(sizeof(struct tpacket_ring)); + netdev->tp_tx_ring = xzalloc(sizeof(struct tpacket_ring)); + netdev->tp_rx_ring->sockfd = rx->fd; + netdev->tp_tx_ring->sockfd = rx->fd; + error = tpacket_setup_ring(rx->fd, netdev->tp_rx_ring, + PACKET_RX_RING); + if (error != 0) { + error = errno; + VLOG_ERR("%s: failed to set tpacket rx ring (%s)", + netdev_get_name(netdev_), 
ovs_strerror(error)); + goto error; + } + error = tpacket_setup_ring(rx->fd, netdev->tp_tx_ring, + PACKET_TX_RING); + if (error != 0) { + error = errno; + VLOG_ERR("%s: failed to set tpacket tx ring (%s)", + netdev_get_name(netdev_), ovs_strerror(error)); + goto error; + } + error = tpacket_mmap_rx_tx_ring(rx->fd, netdev->tp_rx_ring, + netdev->tp_tx_ring); + if (error != 0) { + error = errno; + VLOG_ERR("%s: failed to mmap tpacket rx & tx ring (%s)", + netdev_get_name(netdev_), ovs_strerror(error)); + goto error; + } + } +#endif + /* Set non-blocking mode. */ error = set_nonblocking(rx->fd); if (error) { @@ -1139,9 +1297,16 @@ netdev_linux_rxq_construct(struct netdev_rxq *rxq_) /* Bind to specific ethernet device. */ memset(&sll, 0, sizeof sll); - sll.sll_family = AF_PACKET; + sll.sll_family = PF_PACKET; +#ifdef HAVE_TPACKET_V3 + if (!tso) { + sll.sll_hatype = 0; + sll.sll_pkttype = 0; + sll.sll_halen = 0; + } +#endif sll.sll_ifindex = ifindex; - sll.sll_protocol = htons(ETH_P_ALL); + sll.sll_protocol = (OVS_FORCE ovs_be16) htons(ETH_P_ALL); if (bind(rx->fd, (struct sockaddr *) &sll, sizeof sll) < 0) { error = errno; VLOG_ERR("%s: failed to bind raw socket (%s)", @@ -1178,6 +1343,19 @@ netdev_linux_rxq_destruct(struct netdev_rxq *rxq_) int i; if (!rx->is_tap) { +#ifdef HAVE_TPACKET_V3 + if (!userspace_tso_enabled()) { + struct netdev_linux *netdev = netdev_linux_cast(rx->up.netdev); + + if (netdev->tp_rx_ring) { + munmap(netdev->tp_rx_ring->mm_space, + 2 * netdev->tp_rx_ring->mm_len); + free(netdev->tp_rx_ring->rd); + free(netdev->tp_tx_ring->rd); + } + } +#endif + close(rx->fd); } @@ -1220,8 +1398,8 @@ auxdata_has_vlan_tci(const struct tpacket_auxdata *aux) * It also used recvmmsg to reduce multiple syscalls overhead; */ static int -netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu, - struct dp_packet_batch *batch) +netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, bool tso, + int mtu, struct dp_packet_batch *batch) { int iovlen; 
size_t std_len; @@ -1237,7 +1415,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu, struct dp_packet *buffers[NETDEV_MAX_BURST]; int i; - if (userspace_tso_enabled()) { + if (tso) { /* Use the buffer from the allocated packet below to receive MTU * sized packets and an aux_buf for extra TSO data. */ iovlen = IOV_TSO_SIZE; @@ -1368,7 +1546,7 @@ netdev_linux_batch_rxq_recv_sock(struct netdev_rxq_linux *rx, int mtu, * packets are added into *batch. The return value is 0 or errno. */ static int -netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, +netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, bool tso, int mtu, struct dp_packet_batch *batch) { int virtio_net_hdr_size; @@ -1377,7 +1555,7 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, int iovlen; int i; - if (userspace_tso_enabled()) { + if (tso) { /* Use the buffer from the allocated packet below to receive MTU * sized packets and an aux_buf for extra TSO data. 
*/ iovlen = IOV_TSO_SIZE; @@ -1454,6 +1632,109 @@ netdev_linux_batch_rxq_recv_tap(struct netdev_rxq_linux *rx, int mtu, return 0; } +#ifdef HAVE_TPACKET_V3 +static int +netdev_linux_batch_recv_tpacket(struct netdev_rxq_linux *rx, bool tso, int mtu, + struct dp_packet_batch *batch) +{ + struct netdev *netdev_ = netdev_rxq_get_netdev(&rx->up); + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + struct dp_packet *buffer; + int i = 0; + unsigned int block_num; + unsigned int fn_in_block; + struct tpacket_block_desc *pbd; + struct tpacket3_hdr *ppd; + int virtio_net_hdr_size; + size_t buffer_len; + + if (tso) { + virtio_net_hdr_size = sizeof(struct virtio_net_hdr); + } else { + virtio_net_hdr_size = 0; + } + buffer_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + mtu; + + ppd = ALIGNED_CAST(struct tpacket3_hdr *, netdev->tp_rx_ring->ppd); + block_num = netdev->tp_rx_ring->block_num; + fn_in_block = netdev->tp_rx_ring->frame_num_in_block; + pbd = ALIGNED_CAST(struct tpacket_block_desc *, + netdev->tp_rx_ring->rd[block_num].iov_base); + + while (i < NETDEV_MAX_BURST) { + if ((pbd->hdr.bh1.block_status & TP_STATUS_USER) == 0) { + break; + } + if (fn_in_block == 0) { + ppd = ALIGNED_CAST(struct tpacket3_hdr *, (uint8_t *) pbd + + pbd->hdr.bh1.offset_to_first_pkt); + } + + if (ppd->tp_snaplen > (mtu + VLAN_ETH_HEADER_LEN)) { + buffer_len = virtio_net_hdr_size + VLAN_ETH_HEADER_LEN + + ppd->tp_snaplen; + } + + buffer = dp_packet_new_with_headroom(buffer_len, DP_NETDEV_HEADROOM); + memcpy(dp_packet_data(buffer), + (uint8_t *) ppd + ppd->tp_mac - virtio_net_hdr_size, + ppd->tp_snaplen + virtio_net_hdr_size); + dp_packet_set_size(buffer, + dp_packet_size(buffer) + ppd->tp_snaplen + + virtio_net_hdr_size); + + if (virtio_net_hdr_size && netdev_linux_parse_vnet_hdr(buffer)) { + /* Unexpected error situation: the virtio header is not present + * or corrupted. Drop the packet but continue in case next ones + * are correct. 
*/ + dp_packet_delete(buffer); + netdev->rx_dropped += 1; + VLOG_WARN_RL(&rl, "%s: Dropped packet: Invalid virtio net header", + netdev_get_name(netdev_)); + } else { + if (ppd->tp_status & TP_STATUS_VLAN_VALID) { + struct eth_header *eth; + bool double_tagged; + ovs_be16 vlan_tpid; + + eth = dp_packet_data(buffer); + double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q); + if (ppd->tp_status & TP_STATUS_VLAN_TPID_VALID) { + vlan_tpid = htons(ppd->hv1.tp_vlan_tpid); + } else if (double_tagged) { + vlan_tpid = htons(ETH_TYPE_VLAN_8021AD); + } else { + vlan_tpid = htons(ETH_TYPE_VLAN_8021Q); + } + eth_push_vlan(buffer, vlan_tpid, htons(ppd->hv1.tp_vlan_tci)); + } + dp_packet_batch_add(batch, buffer); + } + + fn_in_block++; + if (fn_in_block >= pbd->hdr.bh1.num_pkts) { + pbd->hdr.bh1.block_status = TP_STATUS_KERNEL; + block_num = (block_num + 1) % + netdev->tp_rx_ring->req.tp_block_nr; + pbd = (struct tpacket_block_desc *) + netdev->tp_rx_ring->rd[block_num].iov_base; + fn_in_block = 0; + ppd = NULL; + } else { + ppd = ALIGNED_CAST(struct tpacket3_hdr *, + (uint8_t *) ppd + ppd->tp_next_offset); + } + i++; + } + + netdev->tp_rx_ring->block_num = block_num; + netdev->tp_rx_ring->frame_num_in_block = fn_in_block; + netdev->tp_rx_ring->ppd = ppd; + + return 0; +} +#endif /* HAVE_TPACKET_V3 */ + static int netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, int *qfill) @@ -1462,12 +1743,13 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, struct netdev *netdev = rx->up.netdev; ssize_t retval; int mtu; + bool tso = userspace_tso_enabled(); if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) { mtu = ETH_PAYLOAD_MAX; } - if (userspace_tso_enabled()) { + if (tso) { /* Allocate TSO packets. The packet has enough headroom to store * a full non-TSO packet. 
When a TSO packet is received, the data * from non-TSO buffer (std_len) is prepended to the TSO packet @@ -1485,9 +1767,19 @@ netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch, } dp_packet_batch_init(batch); - retval = (rx->is_tap - ? netdev_linux_batch_rxq_recv_tap(rx, mtu, batch) - : netdev_linux_batch_rxq_recv_sock(rx, mtu, batch)); + if (rx->is_tap) { + retval = netdev_linux_batch_rxq_recv_tap(rx, tso, mtu, batch); + } else { + if (tso) { + retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch); + } else { +#ifndef HAVE_TPACKET_V3 + retval = netdev_linux_batch_rxq_recv_sock(rx, tso, mtu, batch); +#else + retval = netdev_linux_batch_recv_tpacket(rx, tso, mtu, batch); +#endif + } + } if (retval) { if (retval != EAGAIN && retval != EMSGSIZE) { @@ -1692,6 +1984,83 @@ netdev_linux_get_numa_id(const struct netdev *netdev_) return numa_id; } +#ifdef HAVE_TPACKET_V3 +static inline int +tpacket_tx_is_ready(void * next_frame) +{ + struct tpacket3_hdr *hdr = ALIGNED_CAST(struct tpacket3_hdr *, next_frame); + + return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)); +} + +static int +netdev_linux_tpacket_batch_send(struct netdev *netdev_, bool tso, int mtu, + struct dp_packet_batch *batch) +{ + struct netdev_linux *netdev = netdev_linux_cast(netdev_); + struct dp_packet *packet; + int sockfd; + ssize_t bytes_sent; + int total_pkts = 0; + + unsigned int frame_nr = netdev->tp_tx_ring->req.tp_frame_nr; + unsigned int frame_num = netdev->tp_tx_ring->frame_num; + + /* The Linux tap driver returns EIO if the device is not up, + * so if the device is not up, don't waste time sending it. + * However, if the device is in another network namespace + * then OVS can't retrieve the state. In that case, send the + * packets anyway. 
*/ + if (netdev->present && !(netdev->ifi_flags & IFF_UP)) { + netdev->tx_dropped += dp_packet_batch_size(batch); + return 0; + } + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + size_t size; + struct tpacket3_hdr *ppd; + + if (tso) { + netdev_linux_prepend_vnet_hdr(packet, mtu); + } + + size = dp_packet_size(packet); + ppd = tpacket_get_next_frame(netdev->tp_tx_ring, frame_num); + + if (!tpacket_tx_is_ready(ppd)) { + break; + } + ppd->tp_snaplen = size; + ppd->tp_len = size; + ppd->tp_next_offset = 0; + + memcpy((uint8_t *)ppd + TPACKET3_HDRLEN - sizeof(struct sockaddr_ll), + dp_packet_data(packet), + size); + ppd->tp_status = TP_STATUS_SEND_REQUEST; + frame_num = (frame_num + 1) % frame_nr; + total_pkts++; + } + netdev->tp_tx_ring->frame_num = frame_num; + + /* Kick-off transmits */ + if (total_pkts != 0) { + sockfd = netdev->tp_tx_ring->sockfd; + bytes_sent = sendto(sockfd, NULL, 0, MSG_DONTWAIT, NULL, 0); + if (bytes_sent == -1 && + errno != ENOBUFS && errno != EAGAIN) { + /* + * In case of an ENOBUFS/EAGAIN error all of the enqueued + * packets will be considered successful even though only some + * are sent. + */ + netdev->tx_dropped += dp_packet_batch_size(batch); + } + } + return 0; +} +#endif + /* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a positive * errno value. Returns EAGAIN without blocking if the packet cannot be queued * immediately. Returns EMSGSIZE if a partial packet was transmitted or if @@ -1731,7 +2100,17 @@ netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED, goto free_batch; } - error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, batch); + if (tso) { + error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, + batch); + } else { +#ifndef HAVE_TPACKET_V3 + error = netdev_linux_sock_batch_send(sock, ifindex, tso, mtu, + batch); +#else + error = netdev_linux_tpacket_batch_send(netdev_, tso, mtu, batch); +#endif + } } else { error = netdev_linux_tap_batch_send(netdev_, tso, mtu, batch); }