[ovs-dev,v2] Use batch process recv for tap and raw socket in netdev datapath
diff mbox series

Message ID 1576636527-14846-1-git-send-email-yang_y_yi@163.com
State New
Headers show
Series
  • [ovs-dev,v2] Use batch process recv for tap and raw socket in netdev datapath
Related show

Commit Message

yang_y_yi Dec. 18, 2019, 2:35 a.m. UTC
From: Yi Yang <yangyi01@inspur.com>

Current netdev_linux_rxq_recv_tap and netdev_linux_rxq_recv_sock
just receive single packet, that is very inefficient, per my test
case which adds two tap ports or veth ports into OVS bridge
(datapath_type=netdev) and use iperf3 to do performance test
between two ports (they are set into different network name space).

The result is as below:

  tap:  295 Mbits/sec
  veth: 207 Mbits/sec

After I change netdev_linux_rxq_recv_tap and
netdev_linux_rxq_recv_sock to use batch process, the performance
is boosted by about 7 times, here is the result:

  tap:  1.96 Gbits/sec
  veth: 1.47 Gbits/sec

Undoubtedly this is a huge improvement although it can't match
OVS kernel datapath yet.

FYI: here is thr result for OVS kernel datapath:

  tap:  37.2 Gbits/sec
  veth: 36.3 Gbits/sec

Note: performance result is highly related with your test machine
, you shouldn't expect the same results on your test machine.

Changes since v1:
  - Add fix from Ben Pfaff

Signed-off-by: Yi Yang <yangyi01@inspur.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
---
 include/sparse/sys/socket.h |   3 +
 lib/netdev-linux.c          | 167 +++++++++++++++++++++++++++++---------------
 2 files changed, 112 insertions(+), 58 deletions(-)

Comments

0-day Robot Dec. 18, 2019, 2:58 a.m. UTC | #1
Bleep bloop.  Greetings , I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Unexpected sign-offs from developers who are not authors or co-authors or committers: Ben Pfaff <blp@ovn.org>
Lines checked: 302, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot

Patch
diff mbox series

diff --git a/include/sparse/sys/socket.h b/include/sparse/sys/socket.h
index 4178f57..d3c3611 100644
--- a/include/sparse/sys/socket.h
+++ b/include/sparse/sys/socket.h
@@ -27,6 +27,7 @@ 
 
 typedef unsigned short int sa_family_t;
 typedef __socklen_t socklen_t;
+struct timespec;
 
 struct sockaddr {
     sa_family_t sa_family;
@@ -163,6 +164,8 @@  ssize_t recvmsg(int, struct msghdr *, int);
 ssize_t send(int, const void *, size_t, int);
 ssize_t sendmsg(int, const struct msghdr *, int);
 int sendmmsg(int, struct mmsghdr *, unsigned int, unsigned int);
+int recvmmsg(int, struct mmsghdr *, unsigned int,
+             unsigned int, struct timespec *);
 ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *,
                socklen_t);
 int setsockopt(int, int, int, const void *, socklen_t);
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index f8e59ba..3414a64 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -1151,90 +1151,147 @@  auxdata_has_vlan_tci(const struct tpacket_auxdata *aux)
     return aux->tp_vlan_tci || aux->tp_status & TP_STATUS_VLAN_VALID;
 }
 
+/*
+ * Receive packets from raw socket in batch process for better performance,
+ * it can receive NETDEV_MAX_BURST packets at most once, the received
+ * packets are added into *batch. The return value is 0 or errno.
+ *
+ * It also used recvmmsg to reduce multiple syscalls overhead;
+ */
 static int
-netdev_linux_rxq_recv_sock(int fd, struct dp_packet *buffer)
+netdev_linux_batch_rxq_recv_sock(int fd, int mtu,
+                                 struct dp_packet_batch *batch)
 {
     size_t size;
     ssize_t retval;
-    struct iovec iov;
+    struct iovec iovs[NETDEV_MAX_BURST];
     struct cmsghdr *cmsg;
     union {
         struct cmsghdr cmsg;
         char buffer[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
-    } cmsg_buffer;
-    struct msghdr msgh;
-
-    /* Reserve headroom for a single VLAN tag */
-    dp_packet_reserve(buffer, VLAN_HEADER_LEN);
-    size = dp_packet_tailroom(buffer);
-
-    iov.iov_base = dp_packet_data(buffer);
-    iov.iov_len = size;
-    msgh.msg_name = NULL;
-    msgh.msg_namelen = 0;
-    msgh.msg_iov = &iov;
-    msgh.msg_iovlen = 1;
-    msgh.msg_control = &cmsg_buffer;
-    msgh.msg_controllen = sizeof cmsg_buffer;
-    msgh.msg_flags = 0;
+    } cmsg_buffers[NETDEV_MAX_BURST];
+    struct mmsghdr mmsgs[NETDEV_MAX_BURST];
+    struct dp_packet *buffers[NETDEV_MAX_BURST];
+    int i;
+
+    for (i = 0; i < NETDEV_MAX_BURST; i++) {
+         buffers[i] = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
+                                                  DP_NETDEV_HEADROOM);
+         /* Reserve headroom for a single VLAN tag */
+         dp_packet_reserve(buffers[i], VLAN_HEADER_LEN);
+         size = dp_packet_tailroom(buffers[i]);
+         iovs[i].iov_base = dp_packet_data(buffers[i]);
+         iovs[i].iov_len = size;
+         mmsgs[i].msg_hdr.msg_name = NULL;
+         mmsgs[i].msg_hdr.msg_namelen = 0;
+         mmsgs[i].msg_hdr.msg_iov = &iovs[i];
+         mmsgs[i].msg_hdr.msg_iovlen = 1;
+         mmsgs[i].msg_hdr.msg_control = &cmsg_buffers[i];
+         mmsgs[i].msg_hdr.msg_controllen = sizeof cmsg_buffers[i];
+         mmsgs[i].msg_hdr.msg_flags = 0;
+    }
 
     do {
-        retval = recvmsg(fd, &msgh, MSG_TRUNC);
+        retval = recvmmsg(fd, mmsgs, NETDEV_MAX_BURST, MSG_TRUNC, NULL);
     } while (retval < 0 && errno == EINTR);
 
     if (retval < 0) {
-        return errno;
-    } else if (retval > size) {
-        return EMSGSIZE;
+        /* Save -errno to retval temporarily */
+        retval = -errno;
+        i = 0;
+        goto free_buffers;
     }
 
-    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
-
-    for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg; cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
-        const struct tpacket_auxdata *aux;
-
-        if (cmsg->cmsg_level != SOL_PACKET
-            || cmsg->cmsg_type != PACKET_AUXDATA
-            || cmsg->cmsg_len < CMSG_LEN(sizeof(struct tpacket_auxdata))) {
-            continue;
+    for (i = 0; i < retval; i++) {
+        if (mmsgs[i].msg_len < ETH_HEADER_LEN) {
+            break;
         }
 
-        aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
-        if (auxdata_has_vlan_tci(aux)) {
-            struct eth_header *eth;
-            bool double_tagged;
+        dp_packet_set_size(buffers[i],
+                           dp_packet_size(buffers[i]) + mmsgs[i].msg_len);
+
+        for (cmsg = CMSG_FIRSTHDR(&mmsgs[i].msg_hdr); cmsg;
+                 cmsg = CMSG_NXTHDR(&mmsgs[i].msg_hdr, cmsg)) {
+            const struct tpacket_auxdata *aux;
 
-            if (retval < ETH_HEADER_LEN) {
-                return EINVAL;
+            if (cmsg->cmsg_level != SOL_PACKET
+                || cmsg->cmsg_type != PACKET_AUXDATA
+                || cmsg->cmsg_len <
+                       CMSG_LEN(sizeof(struct tpacket_auxdata))) {
+                continue;
             }
 
-            eth = dp_packet_data(buffer);
-            double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
+            aux = ALIGNED_CAST(struct tpacket_auxdata *, CMSG_DATA(cmsg));
+            if (auxdata_has_vlan_tci(aux)) {
+                struct eth_header *eth;
+                bool double_tagged;
 
-            eth_push_vlan(buffer, auxdata_to_vlan_tpid(aux, double_tagged),
-                          htons(aux->tp_vlan_tci));
-            break;
+                eth = dp_packet_data(buffers[i]);
+                double_tagged = eth->eth_type == htons(ETH_TYPE_VLAN_8021Q);
+
+                eth_push_vlan(buffers[i],
+                              auxdata_to_vlan_tpid(aux, double_tagged),
+                              htons(aux->tp_vlan_tci));
+                break;
+            }
         }
+        dp_packet_batch_add(batch, buffers[i]);
+    }
+
+free_buffers:
+    /* Free unused buffers, including buffers whose size is less than
+     * ETH_HEADER_LEN.
+     *
+     * Note: i has been set correctly by the above for loop, so don't
+     * try to re-initialize it.
+     */
+    for (; i < NETDEV_MAX_BURST; i++) {
+        dp_packet_delete(buffers[i]);
+    }
+
+    /* netdev_linux_rxq_recv needs it to return 0 or positive errno */
+    if (retval < 0) {
+        return -retval;
     }
 
     return 0;
 }
 
+/*
+ * Receive packets from tap by batch process for better performance,
+ * it can receive NETDEV_MAX_BURST packets at most once, the received
+ * packets are added into *batch. The return value is 0 or errno.
+ */
 static int
-netdev_linux_rxq_recv_tap(int fd, struct dp_packet *buffer)
+netdev_linux_batch_rxq_recv_tap(int fd, int mtu, struct dp_packet_batch *batch)
 {
+    struct dp_packet *buffer;
     ssize_t retval;
-    size_t size = dp_packet_tailroom(buffer);
+    size_t size;
+    int i;
 
-    do {
-        retval = read(fd, dp_packet_data(buffer), size);
-    } while (retval < 0 && errno == EINTR);
+    for (i = 0; i < NETDEV_MAX_BURST; i++) {
+        /* Assume Ethernet port. No need to set packet_type. */
+        buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
+                                             DP_NETDEV_HEADROOM);
+        size = dp_packet_tailroom(buffer);
+        do {
+            retval = read(fd, dp_packet_data(buffer), size);
+        } while (retval < 0 && errno == EINTR);
 
-    if (retval < 0) {
+        if (retval < 0) {
+            dp_packet_delete(buffer);
+            break;
+        }
+
+        dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
+        dp_packet_batch_add(batch, buffer);
+    }
+
+    if ((i == 0) && (retval < 0)) {
         return errno;
     }
 
-    dp_packet_set_size(buffer, dp_packet_size(buffer) + retval);
     return 0;
 }
 
@@ -1244,7 +1301,6 @@  netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
 {
     struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
     struct netdev *netdev = rx->up.netdev;
-    struct dp_packet *buffer;
     ssize_t retval;
     int mtu;
 
@@ -1252,21 +1308,16 @@  netdev_linux_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
         mtu = ETH_PAYLOAD_MAX;
     }
 
-    /* Assume Ethernet port. No need to set packet_type. */
-    buffer = dp_packet_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
-                                           DP_NETDEV_HEADROOM);
+    dp_packet_batch_init(batch);
     retval = (rx->is_tap
-              ? netdev_linux_rxq_recv_tap(rx->fd, buffer)
-              : netdev_linux_rxq_recv_sock(rx->fd, buffer));
+              ? netdev_linux_batch_rxq_recv_tap(rx->fd, mtu, batch)
+              : netdev_linux_batch_rxq_recv_sock(rx->fd, mtu, batch));
 
     if (retval) {
         if (retval != EAGAIN && retval != EMSGSIZE) {
             VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
                          netdev_rxq_get_name(rxq_), ovs_strerror(errno));
         }
-        dp_packet_delete(buffer);
-    } else {
-        dp_packet_batch_init_packet(batch, buffer);
     }
 
     if (qfill) {