diff mbox

[ovs-dev,v3] netdev-linux: Replace sendmsg with sendmmsg in netdev_linux_send

Message ID 20170717023957.7783-1-sysugaozhenyu@gmail.com
State Superseded
Headers show

Commit Message

Gao Zhenyu July 17, 2017, 2:39 a.m. UTC
sendmmsg can reduce the CPU cycles spent sending packets to the kernel.
Replace sendmsg with sendmmsg in netdev_linux_send so that packets are
sent in batches when sendmmsg is available.

If the kernel does not support sendmmsg, sending falls back to sendmsg.

    netserver
|------------|
|            |
|  container |
|----veth----|
          |
          |        |------------|
          |---veth-|   dpdk-ovs |      netperf
                   |            |  |--------------|
                   |----dpdk----|  | bare-metal   |
                         |         |--------------|
                         |              |
                         |              |
                        pnic-----------pnic

Netperf was used to test the performance:

1)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 1400
result: netserver received 2383.21Mb(sendmsg)/2551.64Mb(sendmmsg)

2)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 60
result: netserver received 109.72Mb(sendmsg)/115.18Mb(sendmmsg)

Sendmmsg shows about a 6% improvement in netperf UDP testing.

Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
---
 lib/netdev-linux.c | 85 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 23 deletions(-)

Comments

Gao Zhenyu July 27, 2017, 12:57 a.m. UTC | #1
Ping....

Thanks
Zhenyu Gao

2017-07-17 10:39 GMT+08:00 Zhenyu Gao <sysugaozhenyu@gmail.com>:

> Sendmmsg can reduce cpu cycles in sending packets to kernel.
> Replace sendmsg with sendmmsg in function netdev_linux_send to send
> batch packets if sendmmsg is available.
>
> If kernel side doesn't support sendmmsg, will fallback to sendmsg.
>
>     netserver
> |------------|
> |            |
> |  container |
> |----veth----|
>           |
>           |        |------------|
>           |---veth-|   dpdk-ovs |      netperf
>                    |            |  |--------------|
>                    |----dpdk----|  | bare-metal   |
>                          |         |--------------|
>                          |              |
>                          |              |
>                         pnic-----------pnic
>
> Netperf was consumed to test the performance:
>
> 1)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 1400
> result: netserver received 2383.21Mb(sendmsg)/2551.64Mb(sendmmsg)
>
> 2)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 60
> result: netserver received 109.72Mb(sendmsg)/115.18Mb(sendmmsg)
>
> Sendmmsg show about 6% improvement in netperf UDP testing.
>
> Signed-off-by: Zhenyu Gao <sysugaozhenyu@gmail.com>
> ---
>  lib/netdev-linux.c | 85 ++++++++++++++++++++++++++++++
> +++++++++---------------
>  1 file changed, 62 insertions(+), 23 deletions(-)
>
> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> index e1d8701..d991d05 100644
> --- a/lib/netdev-linux.c
> +++ b/lib/netdev-linux.c
> @@ -1182,6 +1182,54 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
>      }
>  }
>
> +static inline int
> +netdev_linux_sock_batch_send(int sock, struct msghdr *msg,
> +                             struct dp_packet_batch *batch)
> +{
> +    int error = 0;
> +    ssize_t retval;
> +    uint32_t resend_idx = 0;
> +    struct mmsghdr *mmsg;
> +    struct iovec *iov;
> +
> +    mmsg = xmalloc(sizeof(*mmsg) * batch->count);
> +    iov = xmalloc(sizeof(*iov) * batch->count);
> +
> +    for (int i = 0; i < batch->count; i++) {
> +        const void *data = dp_packet_data(batch->packets[i]);
> +        size_t size = dp_packet_size(batch->packets[i]);
> +
> +        /* Truncate the packet if it is configured. */
> +        size -= dp_packet_get_cutlen(batch->packets[i]);
> +
> +        iov[i].iov_base = CONST_CAST(void *, data);
> +        iov[i].iov_len = size;
> +        mmsg[i].msg_hdr = *msg;
> +        mmsg[i].msg_hdr.msg_iov = &iov[i];
> +    }
> +
> +resend_batch:
> +    retval = sendmmsg(sock, mmsg + resend_idx,
> +                      batch->count - resend_idx, 0);
> +    if (retval < 0) {
> +        if (errno == EINTR) {
> +            goto resend_batch;
> +        }
> +        /* The Linux AF_PACKET implementation never blocks waiting for
> +         * room for packets, instead returning ENOBUFS.  Translate this
> +         * into EAGAIN for the caller. */
> +        error = errno == ENOBUFS ? EAGAIN : errno;
> +    } else if (retval != batch->count - resend_idx) {
> +       /* Send remain packets again. */
> +        resend_idx += retval;
> +        goto resend_batch;
> +    }
> +
> +    free(mmsg);
> +    free(iov);
> +    return error;
> +}
> +
>  /* Sends 'buffer' on 'netdev'.  Returns 0 if successful, otherwise a
> positive
>   * errno value.  Returns EAGAIN without blocking if the packet cannot be
> queued
>   * immediately.  Returns EMSGSIZE if a partial packet was transmitted or
> if
> @@ -1226,6 +1274,9 @@ netdev_linux_send(struct netdev *netdev_, int qid
> OVS_UNUSED,
>          msg.msg_control = NULL;
>          msg.msg_controllen = 0;
>          msg.msg_flags = 0;
> +
> +        error = netdev_linux_sock_batch_send(sock, &msg, batch);
> +        goto check_error;
>      }
>
>      /* 'i' is incremented only if there's no error */
> @@ -1236,34 +1287,21 @@ netdev_linux_send(struct netdev *netdev_, int qid
> OVS_UNUSED,
>
>          /* Truncate the packet if it is configured. */
>          size -= dp_packet_get_cutlen(batch->packets[i]);
> +        /* Use the tap fd to send to this device.  This is essential for
> +         * tap devices, because packets sent to a tap device with an
> +         * AF_PACKET socket will loop back to be *received* again on the
> +         * tap device.  This doesn't occur on other interface types
> +         * because we attach a socket filter to the rx socket. */
> +        struct netdev_linux *netdev = netdev_linux_cast(netdev_);
>
> -        if (!is_tap_netdev(netdev_)) {
> -            /* Use our AF_PACKET socket to send to this device. */
> -            struct iovec iov;
> -
> -            iov.iov_base = CONST_CAST(void *, data);
> -            iov.iov_len = size;
> -
> -            msg.msg_iov = &iov;
> -
> -            retval = sendmsg(sock, &msg, 0);
> -        } else {
> -            /* Use the tap fd to send to this device.  This is essential
> for
> -             * tap devices, because packets sent to a tap device with an
> -             * AF_PACKET socket will loop back to be *received* again on
> the
> -             * tap device.  This doesn't occur on other interface types
> -             * because we attach a socket filter to the rx socket. */
> -            struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> -
> -            retval = write(netdev->tap_fd, data, size);
> -        }
> +        retval = write(netdev->tap_fd, data, size);
>
>          if (retval < 0) {
>              if (errno == EINTR) {
>                  /* The send was interrupted by a signal.  Retry the
> packet by
>                   * continuing without incrementing 'i'.*/
>                  continue;
> -            } else if (errno == EIO && is_tap_netdev(netdev_)) {
> +            } else if (errno == EIO) {
>                  /* The Linux tap driver returns EIO if the device is not
> up.
>                   * From the OVS side this is not an error, so ignore it.
> */
>              } else {
> @@ -1285,9 +1323,10 @@ netdev_linux_send(struct netdev *netdev_, int qid
> OVS_UNUSED,
>          i++;
>      }
>
> +check_error:
>      if (error && error != EAGAIN) {
> -            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
> -                         netdev_get_name(netdev_), ovs_strerror(error));
> +        VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
> +                     netdev_get_name(netdev_), ovs_strerror(error));
>      }
>
>  free_batch:
> --
> 1.8.3.1
>
>
Ben Pfaff Aug. 2, 2017, 9:59 p.m. UTC | #2
On Mon, Jul 17, 2017 at 02:39:57AM +0000, Zhenyu Gao wrote:
> Sendmmsg can reduce cpu cycles in sending packets to kernel.
> Replace sendmsg with sendmmsg in function netdev_linux_send to send
> batch packets if sendmmsg is available.

Thanks for the patch.

I like the idea but I found the details to be a little hard to follow,
especially given the multiple goto statements.

How about the following instead?
        https://patchwork.ozlabs.org/patch/796898/
        https://patchwork.ozlabs.org/patch/796899/

Thanks,

Ben.
Gao Zhenyu Aug. 3, 2017, 1:21 a.m. UTC | #3
It looks better with the independent function netdev_linux_tap_batch_send.
Thanks for working on it.


Thanks
Zhenyu Gao

2017-08-03 5:59 GMT+08:00 Ben Pfaff <blp@ovn.org>:

> On Mon, Jul 17, 2017 at 02:39:57AM +0000, Zhenyu Gao wrote:
> > Sendmmsg can reduce cpu cycles in sending packets to kernel.
> > Replace sendmsg with sendmmsg in function netdev_linux_send to send
> > batch packets if sendmmsg is available.
>
> Thanks for the patch.
>
> I like the idea but I found the details to be a little hard to follow,
> especially given the multiple goto statements.
>
> How about the following instead?
>         https://patchwork.ozlabs.org/patch/796898/
>         https://patchwork.ozlabs.org/patch/796899/
>
> Thanks,
>
> Ben.
>
diff mbox

Patch

diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index e1d8701..d991d05 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -1182,6 +1182,54 @@  netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
     }
 }
 
+static inline int
+netdev_linux_sock_batch_send(int sock, struct msghdr *msg,
+                             struct dp_packet_batch *batch)
+{
+    int error = 0;
+    ssize_t retval;
+    uint32_t resend_idx = 0;
+    struct mmsghdr *mmsg;
+    struct iovec *iov;
+
+    mmsg = xmalloc(sizeof(*mmsg) * batch->count);
+    iov = xmalloc(sizeof(*iov) * batch->count);
+
+    for (int i = 0; i < batch->count; i++) {
+        const void *data = dp_packet_data(batch->packets[i]);
+        size_t size = dp_packet_size(batch->packets[i]);
+
+        /* Shorten the packet by its cut length, if one is configured. */
+        size -= dp_packet_get_cutlen(batch->packets[i]);
+
+        iov[i].iov_base = CONST_CAST(void *, data);
+        iov[i].iov_len = size;
+        mmsg[i].msg_hdr = *msg;
+        mmsg[i].msg_hdr.msg_iov = &iov[i];
+    }
+
+resend_batch:
+    retval = sendmmsg(sock, mmsg + resend_idx,
+                      batch->count - resend_idx, 0);
+    if (retval < 0) {
+        if (errno == EINTR) {
+            goto resend_batch;
+        }
+        /* The Linux AF_PACKET implementation never blocks waiting for
+         * room for packets, instead returning ENOBUFS.  Translate this
+         * into EAGAIN for the caller. */
+        error = errno == ENOBUFS ? EAGAIN : errno;
+    } else if (retval != batch->count - resend_idx) {
+        /* Only some packets were sent: retry from the first unsent one. */
+        resend_idx += retval;
+        goto resend_batch;
+    }
+
+    free(mmsg);
+    free(iov);
+    return error;
+}
+
 /* Sends 'buffer' on 'netdev'.  Returns 0 if successful, otherwise a positive
  * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
  * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
@@ -1226,6 +1274,9 @@  netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
         msg.msg_control = NULL;
         msg.msg_controllen = 0;
         msg.msg_flags = 0;
+
+        error = netdev_linux_sock_batch_send(sock, &msg, batch);
+        goto check_error;
     }
 
     /* 'i' is incremented only if there's no error */
@@ -1236,34 +1287,21 @@  netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
 
         /* Truncate the packet if it is configured. */
         size -= dp_packet_get_cutlen(batch->packets[i]);
+        /* Use the tap fd to send to this device.  This is essential for
+         * tap devices, because packets sent to a tap device with an
+         * AF_PACKET socket will loop back to be *received* again on the
+         * tap device.  This doesn't occur on other interface types
+         * because we attach a socket filter to the rx socket. */
+        struct netdev_linux *netdev = netdev_linux_cast(netdev_);
 
-        if (!is_tap_netdev(netdev_)) {
-            /* Use our AF_PACKET socket to send to this device. */
-            struct iovec iov;
-
-            iov.iov_base = CONST_CAST(void *, data);
-            iov.iov_len = size;
-
-            msg.msg_iov = &iov;
-
-            retval = sendmsg(sock, &msg, 0);
-        } else {
-            /* Use the tap fd to send to this device.  This is essential for
-             * tap devices, because packets sent to a tap device with an
-             * AF_PACKET socket will loop back to be *received* again on the
-             * tap device.  This doesn't occur on other interface types
-             * because we attach a socket filter to the rx socket. */
-            struct netdev_linux *netdev = netdev_linux_cast(netdev_);
-
-            retval = write(netdev->tap_fd, data, size);
-        }
+        retval = write(netdev->tap_fd, data, size);
 
         if (retval < 0) {
             if (errno == EINTR) {
                 /* The send was interrupted by a signal.  Retry the packet by
                  * continuing without incrementing 'i'.*/
                 continue;
-            } else if (errno == EIO && is_tap_netdev(netdev_)) {
+            } else if (errno == EIO) {
                 /* The Linux tap driver returns EIO if the device is not up.
                  * From the OVS side this is not an error, so ignore it. */
             } else {
@@ -1285,9 +1323,10 @@  netdev_linux_send(struct netdev *netdev_, int qid OVS_UNUSED,
         i++;
     }
 
+check_error:
     if (error && error != EAGAIN) {
-            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
-                         netdev_get_name(netdev_), ovs_strerror(error));
+        VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
+                     netdev_get_name(netdev_), ovs_strerror(error));
     }
 
 free_batch: