[ovs-dev,2/2] netdev-afxdp: Add need_wakeup supprt.
diff mbox series

Message ID 1566860545-12866-1-git-send-email-u9012063@gmail.com
State Changes Requested
Headers show
Series
  • [ovs-dev,1/2] ovs-thread: Make struct spin lock cache aligned.
Related show

Commit Message

William Tu Aug. 26, 2019, 11:02 p.m. UTC
The patch adds support for using need_wakeup flag in AF_XDP rings.
When this flag is used, it means that OVS has to explicitly wake
up the kernel RX, using poll() syscall and wake up TX, using sendto()
syscall. This feature improves the performance by avoiding unnecessary
syscalls, so keeping more CPU time in userspace to process packets.

On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
to physical port improves from 6.1Mpps to 7.3Mpps.

Suggested-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: William Tu <u9012063@gmail.com>
---
 acinclude.m4       |  5 +++++
 lib/netdev-afxdp.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

Comments

Eelco Chaudron Sept. 4, 2019, 8:04 a.m. UTC | #1
On 27 Aug 2019, at 1:02, William Tu wrote:

> The patch adds support for using need_wakeup flag in AF_XDP rings.
> When this flag is used, it means that OVS has to explicitly wake
> up the kernel RX, using poll() syscall and wake up TX, using sendto()
> syscall. This feature improves the performance by avoiding unnecessary
> syscalls, so keeping more CPU time in userspace to process packets.
>
> On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
> to physical port improves from 6.1Mpps to 7.3Mpps.

Did some more testing and with PVP I see a performance decrease, with 
physical to physical I see an increase.
Tests are performed with a port redirect open flow rule on an ixgbe 
(Xeon E5-2690 v4 2.60GHz):

+-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
|  PVP      | Number of flows |   64    |   128   |   256   |   512   |  
  768   |   1024  |  1514  |
+-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
| master    |            1000 |  737896 |  700643 |  682915 |  648386 |  
621792 |  582821 | 527899 |
| Patch     |            1000 |  734179 |  696515 |  676712 |  646536 |  
619600 |  578546 | 519965 |
+-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+

+-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
| Port2Port | Number of flows |   64    |   128   |   256   |   512   |  
  768   |  1024   |  1514  |
+-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
| master    |            1000 | 3351114 | 3236581 | 3143710 | 2349598 | 
1586276 | 1197304 | 814854 |
| Patch     |            1000 | 3571733 | 3488697 | 3448908 | 2349593 | 
1586276 | 1197304 | 814854 |
+-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+

Did not research why PVP is slower, maybe related to the TAP interface 
with AF_XDP?

See one small comment inline…


> Suggested-by: Eelco Chaudron <echaudro@redhat.com>
> Signed-off-by: William Tu <u9012063@gmail.com>
> ---
>  acinclude.m4       |  5 +++++
>  lib/netdev-afxdp.c | 51 
> +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 56 insertions(+)
>
> diff --git a/acinclude.m4 b/acinclude.m4
> index 116ffcf9096d..8821b821ec3c 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -276,6 +276,11 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
>                [Define to 1 if AF_XDP support is available and 
> enabled.])
>      LIBBPF_LDADD=" -lbpf -lelf"
>      AC_SUBST([LIBBPF_LDADD])
> +
> +    AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
> +      AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1],
> +        [XDP need wakeup support detected in xsk.h.])
> +    ], [], [#include <bpf/xsk.h>])
>    fi
>    AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
>  ])
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index e5b058d08a09..d14d100e8fa3 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -26,6 +26,7 @@
>  #include <linux/rtnetlink.h>
>  #include <linux/if_xdp.h>
>  #include <net/if.h>
> +#include <poll.h>
>  #include <stdlib.h>
>  #include <sys/resource.h>
>  #include <sys/socket.h>
> @@ -117,6 +118,48 @@ struct xsk_socket_info {
>      atomic_uint64_t tx_dropped;
>  };
>
> +#ifdef HAVE_XDP_NEED_WAKEUP
> +static inline void
> +xsk_rx_need_wakeup(struct xsk_umem_info *umem,
> +                   struct netdev *netdev, int fd) {
> +    struct pollfd pfd;
> +    int ret;
> +
> +    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
> +        pfd.fd = fd;
> +        pfd.events = POLLIN;
> +
> +        ret = poll(&pfd, 1, 1000);

Think we should call the poll with a 0 delay as the kernel should have 
something waiting. If for some reason it doesn’t we should not wait 
for a second.

I ran the patch with a 0 timeout, and see no performance differences, or 
any of the below warning messages.

> +        if (OVS_UNLIKELY(ret == 0)) {
> +            VLOG_WARN_RL(&rl, "%s: poll rx fd timeout.",
> +                    netdev_get_name(netdev));
> +        } else if (OVS_UNLIKELY(ret < 0)) {
> +            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
> +                    netdev_get_name(netdev),
> +                    ovs_strerror(ret));
> +        }
> +    }
> +}
> +
> +static inline bool
> +xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info)
> +{
> +    return xsk_ring_prod__needs_wakeup(&xsk_info->tx);
> +}
> +#else
> +static inline void
> +xsk_rx_need_wakeup(struct xsk_umem_info *umem OVS_UNUSED,
> +                   struct netdev *netdev OVS_UNUSED, int fd 
> OVS_UNUSED) {
> +    /* Nothing. */
> +}
> +
> +static inline bool
> +xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED)
> +{
> +    return true;
> +}
> +#endif /* HAVE_XDP_NEED_WAKEUP */
> +
>  static void
>  netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
>  {
> @@ -257,6 +300,10 @@ xsk_configure_socket(struct xsk_umem_info *umem, 
> uint32_t ifindex,
>          cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | 
> XDP_FLAGS_SKB_MODE;
>      }
>
> +#ifdef HAVE_XDP_NEED_WAKEUP
> +    cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
> +#endif
> +
>      if (if_indextoname(ifindex, devname) == NULL) {
>          VLOG_ERR("ifindex %d to devname failed (%s)",
>                   ifindex, ovs_strerror(errno));
> @@ -660,6 +707,7 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, 
> struct dp_packet_batch *batch,
>
>      rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
>      if (!rcvd) {
> +        xsk_rx_need_wakeup(umem, netdev, rx->fd);
>          return EAGAIN;
>      }
>
> @@ -709,6 +757,9 @@ kick_tx(struct xsk_socket_info *xsk_info, int 
> xdpmode)
>      int ret, retries;
>      static const int KERNEL_TX_BATCH_SIZE = 16;
>
> +    if (!xsk_tx_need_wakeup(xsk_info)) {
> +        return 0;
> +    }
>      /* In SKB_MODE packet transmission is synchronous, and the kernel 
> xmits
>       * only TX_BATCH_SIZE(16) packets for a single sendmsg syscall.
>       * So, we have to kick the kernel (n_packets / 16) times to be 
> sure that
> -- 
> 2.7.4
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Ilya Maximets Sept. 4, 2019, 12:19 p.m. UTC | #2
Hi William,

Thanks for the patch!
One general comment is that we, probably, should make this feature
configurable.  There are 2 reasons:

1. We might want to run OVS on older kernel while building with more
   recent libbpf. In this case socket creation will fail due to
   unsupported flag and we'll not be able to make it work.
2. Performance impact is not always positive. It depends on the
   number of available CPU cores, port types (phisical or virtual),
   interrupts affinity.  And this was proved by test results from
   Eelco.  So, it might be good to have control over the enabling/
   disabling the feature.

However, I think that it's better to keep it enabled by default.

Some comments inline.

Best regards, Ilya Maximets.

On 27.08.2019 2:02, William Tu wrote:
> The patch adds support for using need_wakeup flag in AF_XDP rings.
> When this flag is used, it means that OVS has to explicitly wake
> up the kernel RX, using poll() syscall and wake up TX, using sendto()
> syscall. This feature improves the performance by avoiding unnecessary
> syscalls, so keeping more CPU time in userspace to process packets.
> 
> On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
> to physical port improves from 6.1Mpps to 7.3Mpps.
> 
> Suggested-by: Eelco Chaudron <echaudro@redhat.com>

Wasn't it me? :) Just curious.

> Signed-off-by: William Tu <u9012063@gmail.com>
> ---
>  acinclude.m4       |  5 +++++
>  lib/netdev-afxdp.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 56 insertions(+)
> 
> diff --git a/acinclude.m4 b/acinclude.m4
> index 116ffcf9096d..8821b821ec3c 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -276,6 +276,11 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
>                [Define to 1 if AF_XDP support is available and enabled.])
>      LIBBPF_LDADD=" -lbpf -lelf"
>      AC_SUBST([LIBBPF_LDADD])
> +
> +    AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
> +      AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1],
> +        [XDP need wakeup support detected in xsk.h.])
> +    ], [], [#include <bpf/xsk.h>])
>    fi
>    AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
>  ])
> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> index e5b058d08a09..d14d100e8fa3 100644
> --- a/lib/netdev-afxdp.c
> +++ b/lib/netdev-afxdp.c
> @@ -26,6 +26,7 @@
>  #include <linux/rtnetlink.h>
>  #include <linux/if_xdp.h>
>  #include <net/if.h>
> +#include <poll.h>
>  #include <stdlib.h>
>  #include <sys/resource.h>
>  #include <sys/socket.h>
> @@ -117,6 +118,48 @@ struct xsk_socket_info {
>      atomic_uint64_t tx_dropped;
>  };
>  
> +#ifdef HAVE_XDP_NEED_WAKEUP
> +static inline void
> +xsk_rx_need_wakeup(struct xsk_umem_info *umem,

Function name is misleading, because it actually tries to wake
rx up, not just checking. Something like 'xsk_rx_wakeup' or
'xsk_rx_wakeup_if_needed' might be more suitable.
Naming suggestions are welcome.

> +                   struct netdev *netdev, int fd) {
> +    struct pollfd pfd;
> +    int ret;
> +
> +    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
> +        pfd.fd = fd;
> +        pfd.events = POLLIN;
> +
> +        ret = poll(&pfd, 1, 1000);

Agree with Eelco that we shouldn't pass 1000 there. Looks dangerous.

> +        if (OVS_UNLIKELY(ret == 0)) {
> +            VLOG_WARN_RL(&rl, "%s: poll rx fd timeout.",
> +                    netdev_get_name(netdev));

Indents are off here and below. Please, align arguments to the next
symbol after '('.

> +        } else if (OVS_UNLIKELY(ret < 0)) {
> +            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
> +                    netdev_get_name(netdev),
> +                    ovs_strerror(ret));

'ret' is always -1 on failure. errno should be there.

> +        }
> +    }
> +}
> +
> +static inline bool
> +xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info)
> +{
> +    return xsk_ring_prod__needs_wakeup(&xsk_info->tx);
> +}
> +#else
> +static inline void
> +xsk_rx_need_wakeup(struct xsk_umem_info *umem OVS_UNUSED,
> +                   struct netdev *netdev OVS_UNUSED, int fd OVS_UNUSED) {
> +    /* Nothing. */
> +}
> +
> +static inline bool
> +xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED)
> +{
> +    return true;
> +}
> +#endif /* HAVE_XDP_NEED_WAKEUP */
> +
>  static void
>  netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
>  {
> @@ -257,6 +300,10 @@ xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
>          cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE;
>      }
>  
> +#ifdef HAVE_XDP_NEED_WAKEUP
> +    cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
> +#endif
> +
>      if (if_indextoname(ifindex, devname) == NULL) {
>          VLOG_ERR("ifindex %d to devname failed (%s)",
>                   ifindex, ovs_strerror(errno));
> @@ -660,6 +707,7 @@ netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
>  
>      rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
>      if (!rcvd) {
> +        xsk_rx_need_wakeup(umem, netdev, rx->fd);
>          return EAGAIN;
>      }
>  
> @@ -709,6 +757,9 @@ kick_tx(struct xsk_socket_info *xsk_info, int xdpmode)
>      int ret, retries;
>      static const int KERNEL_TX_BATCH_SIZE = 16;
>  
> +    if (!xsk_tx_need_wakeup(xsk_info)) {
> +        return 0;
> +    }
>      /* In SKB_MODE packet transmission is synchronous, and the kernel xmits
>       * only TX_BATCH_SIZE(16) packets for a single sendmsg syscall.
>       * So, we have to kick the kernel (n_packets / 16) times to be sure that
>
William Tu Sept. 4, 2019, 1:50 p.m. UTC | #3
Hi Eelco,

Thanks for your testing and review.

On Wed, Sep 4, 2019 at 1:04 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>
>
>
> On 27 Aug 2019, at 1:02, William Tu wrote:
>
> > The patch adds support for using need_wakeup flag in AF_XDP rings.
> > When this flag is used, it means that OVS has to explicitly wake
> > up the kernel RX, using poll() syscall and wake up TX, using sendto()
> > syscall. This feature improves the performance by avoiding unnecessary
> > syscalls, so keeping more CPU time in userspace to process packets.
> >
> > On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
> > to physical port improves from 6.1Mpps to 7.3Mpps.
>
> Did some more testing and with PVP I see a performance decrease, with
> physical to physical I see an increase.
> Tests are performed with a port redirect open flow rule on an ixgbe
> (Xeon E5-2690 v4 2.60GHz):
>
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> |  PVP      | Number of flows |   64    |   128   |   256   |   512   |
>   768   |   1024  |  1514  |
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> | master    |            1000 |  737896 |  700643 |  682915 |  648386 |
> 621792 |  582821 | 527899 |
> | Patch     |            1000 |  734179 |  696515 |  676712 |  646536 |
> 619600 |  578546 | 519965 |
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> | Port2Port | Number of flows |   64    |   128   |   256   |   512   |
>   768   |  1024   |  1514  |
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> | master    |            1000 | 3351114 | 3236581 | 3143710 | 2349598 |
> 1586276 | 1197304 | 814854 |
> | Patch     |            1000 | 3571733 | 3488697 | 3448908 | 2349593 |
> 1586276 | 1197304 | 814854 |
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>
> Did not research why PVP is slower, maybe related to the TAP interface
> with AF_XDP?
>
I haven't tried PVP with this feature.
Maybe for virtual device, we don't need "need_wakeup" feature.
Let me investigate more.

> See one small comment inline…
>
>
> > Suggested-by: Eelco Chaudron <echaudro@redhat.com>
> > Signed-off-by: William Tu <u9012063@gmail.com>
> > ---
> >  acinclude.m4       |  5 +++++
> >  lib/netdev-afxdp.c | 51
> > +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 56 insertions(+)
> >
> > diff --git a/acinclude.m4 b/acinclude.m4
> > index 116ffcf9096d..8821b821ec3c 100644
> > --- a/acinclude.m4
> > +++ b/acinclude.m4
> > @@ -276,6 +276,11 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
> >                [Define to 1 if AF_XDP support is available and
> > enabled.])
> >      LIBBPF_LDADD=" -lbpf -lelf"
> >      AC_SUBST([LIBBPF_LDADD])
> > +
> > +    AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
> > +      AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1],
> > +        [XDP need wakeup support detected in xsk.h.])
> > +    ], [], [#include <bpf/xsk.h>])
> >    fi
> >    AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
> >  ])
> > diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> > index e5b058d08a09..d14d100e8fa3 100644
> > --- a/lib/netdev-afxdp.c
> > +++ b/lib/netdev-afxdp.c
> > @@ -26,6 +26,7 @@
> >  #include <linux/rtnetlink.h>
> >  #include <linux/if_xdp.h>
> >  #include <net/if.h>
> > +#include <poll.h>
> >  #include <stdlib.h>
> >  #include <sys/resource.h>
> >  #include <sys/socket.h>
> > @@ -117,6 +118,48 @@ struct xsk_socket_info {
> >      atomic_uint64_t tx_dropped;
> >  };
> >
> > +#ifdef HAVE_XDP_NEED_WAKEUP
> > +static inline void
> > +xsk_rx_need_wakeup(struct xsk_umem_info *umem,
> > +                   struct netdev *netdev, int fd) {
> > +    struct pollfd pfd;
> > +    int ret;
> > +
> > +    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
> > +        pfd.fd = fd;
> > +        pfd.events = POLLIN;
> > +
> > +        ret = poll(&pfd, 1, 1000);
>
> Think we should call the poll with a 0 delay as the kernel should have
> something waiting. If for some reason it doesn’t we should not wait
> for a second.
>
> I ran the patch with a 0 timeout, and see no performance differences, or
> any of the below warning messages.
>

Agree,
Thanks!
William
William Tu Sept. 4, 2019, 2:04 p.m. UTC | #4
Hi Ilya,

Thanks for the feedback.

On Wed, Sep 4, 2019 at 5:19 AM Ilya Maximets <i.maximets@samsung.com> wrote:
>
> Hi William,
>
> Thanks for the patch!
> One general comment is that we, probably, should make this feature
> configurable.  There are 2 reasons:
>
> 1. We might want to run OVS on older kernel while building with more
>    recent libbpf. In this case socket creation will fail due to
>    unsupported flag and we'll not be able to make it work.

I think it will also work, it falls back to old behavior.
From:
commit 77cd0d7b3f257fd0e3096b4fdcff1a7d38e99e10
Author: Magnus Karlsson <magnus.karlsson@intel.com>
Date:   Wed Aug 14 09:27:17 2019 +0200
"
   This flag needs some simple driver support. If the driver does not
    support this, the Rx flag is always zero and the Tx flag is always
    one. This makes any application relying on this feature default to the
    old behaviour of not requiring any syscalls in the Rx path and always
    having to call sendto() in the Tx path.
"

> 2. Performance impact is not always positive. It depends on the
>    number of available CPU cores, port types (phisical or virtual),
>    interrupts affinity.  And this was proved by test results from
>    Eelco.  So, it might be good to have control over the enabling/
>    disabling the feature.
>
> However, I think that it's better to keep it enabled by default.

OK, I will make it enabled by default, and add another option.
"options:need_wakeup=false" for user to disable it.

>
> Some comments inline.
>
> Best regards, Ilya Maximets.
>
> On 27.08.2019 2:02, William Tu wrote:
> > The patch adds support for using need_wakeup flag in AF_XDP rings.
> > When this flag is used, it means that OVS has to explicitly wake
> > up the kernel RX, using poll() syscall and wake up TX, using sendto()
> > syscall. This feature improves the performance by avoiding unnecessary
> > syscalls, so keeping more CPU time in userspace to process packets.
> >
> > On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
> > to physical port improves from 6.1Mpps to 7.3Mpps.
> >
> > Suggested-by: Eelco Chaudron <echaudro@redhat.com>
>
> Wasn't it me? :) Just curious.

Went back to check the previous email, and it was you not Eelco.
Sorry for the mistake!

>
> > Signed-off-by: William Tu <u9012063@gmail.com>
> > ---
> >  acinclude.m4       |  5 +++++
> >  lib/netdev-afxdp.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 56 insertions(+)
> >
> > diff --git a/acinclude.m4 b/acinclude.m4
> > index 116ffcf9096d..8821b821ec3c 100644
> > --- a/acinclude.m4
> > +++ b/acinclude.m4
> > @@ -276,6 +276,11 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
> >                [Define to 1 if AF_XDP support is available and enabled.])
> >      LIBBPF_LDADD=" -lbpf -lelf"
> >      AC_SUBST([LIBBPF_LDADD])
> > +
> > +    AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
> > +      AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1],
> > +        [XDP need wakeup support detected in xsk.h.])
> > +    ], [], [#include <bpf/xsk.h>])
> >    fi
> >    AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
> >  ])
> > diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
> > index e5b058d08a09..d14d100e8fa3 100644
> > --- a/lib/netdev-afxdp.c
> > +++ b/lib/netdev-afxdp.c
> > @@ -26,6 +26,7 @@
> >  #include <linux/rtnetlink.h>
> >  #include <linux/if_xdp.h>
> >  #include <net/if.h>
> > +#include <poll.h>
> >  #include <stdlib.h>
> >  #include <sys/resource.h>
> >  #include <sys/socket.h>
> > @@ -117,6 +118,48 @@ struct xsk_socket_info {
> >      atomic_uint64_t tx_dropped;
> >  };
> >
> > +#ifdef HAVE_XDP_NEED_WAKEUP
> > +static inline void
> > +xsk_rx_need_wakeup(struct xsk_umem_info *umem,
>
> Function name is misleading, because it actually tries to wake
> rx up, not just checking. Something like 'xsk_rx_wakeup' or
> 'xsk_rx_wakeup_if_needed' might be more suitable.
> Naming suggestions are welcome.

OK, thanks
>
> > +                   struct netdev *netdev, int fd) {
> > +    struct pollfd pfd;
> > +    int ret;
> > +
> > +    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
> > +        pfd.fd = fd;
> > +        pfd.events = POLLIN;
> > +
> > +        ret = poll(&pfd, 1, 1000);
>
> Agree with Eelco that we shouldn't pass 1000 there. Looks dangerous.
yes

>
> > +        if (OVS_UNLIKELY(ret == 0)) {
> > +            VLOG_WARN_RL(&rl, "%s: poll rx fd timeout.",
> > +                    netdev_get_name(netdev));
>
> Indents are off here and below. Please, align arguments to the next
> symbol after '('.
>
> > +        } else if (OVS_UNLIKELY(ret < 0)) {
> > +            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
> > +                    netdev_get_name(netdev),
> > +                    ovs_strerror(ret));
>
> 'ret' is always -1 on failure. errno should be there.

OK will fix the above two.

William
Ilya Maximets Sept. 4, 2019, 2:10 p.m. UTC | #5
> Hi Eelco,
> 
> Thanks for your testing and review.
> 
> On Wed, Sep 4, 2019 at 1:04 AM Eelco Chaudron <echaudro at redhat.com> wrote:
>>
>>
>>
>> On 27 Aug 2019, at 1:02, William Tu wrote:
>>
>> > The patch adds support for using need_wakeup flag in AF_XDP rings.
>> > When this flag is used, it means that OVS has to explicitly wake
>> > up the kernel RX, using poll() syscall and wake up TX, using sendto()
>> > syscall. This feature improves the performance by avoiding unnecessary
>> > syscalls, so keeping more CPU time in userspace to process packets.
>> >
>> > On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
>> > to physical port improves from 6.1Mpps to 7.3Mpps.
>>
>> Did some more testing and with PVP I see a performance decrease, with
>> physical to physical I see an increase.
>> Tests are performed with a port redirect open flow rule on an ixgbe
>> (Xeon E5-2690 v4 2.60GHz):
>>
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>> |  PVP      | Number of flows |   64    |   128   |   256   |   512   |
>>   768   |   1024  |  1514  |
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>> | master    |            1000 |  737896 |  700643 |  682915 |  648386 |
>> 621792 |  582821 | 527899 |
>> | Patch     |            1000 |  734179 |  696515 |  676712 |  646536 |
>> 619600 |  578546 | 519965 |
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>>
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>> | Port2Port | Number of flows |   64    |   128   |   256   |   512   |
>>   768   |  1024   |  1514  |
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>> | master    |            1000 | 3351114 | 3236581 | 3143710 | 2349598 |
>> 1586276 | 1197304 | 814854 |
>> | Patch     |            1000 | 3571733 | 3488697 | 3448908 | 2349593 |
>> 1586276 | 1197304 | 814854 |
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>>
>> Did not research why PVP is slower, maybe related to the TAP interface
>> with AF_XDP?
>>
> I haven't tried PVP with this feature.
> Maybe for virtual device, we don't need "need_wakeup" feature.
> Let me investigate more.

For virtual devices xmit is synchronous, so it always needs wakeup, i.e.
flag is always set.  In my spare time I'm working on kernel thread to
poll tx queue in SKB mode by the analogue with SQ_POLL in io_uring, but
I'm not sure if it will have good performance impact.

Best regards, Ilya Maximets.
Ilya Maximets Sept. 4, 2019, 2:27 p.m. UTC | #6
On 04.09.2019 17:04, William Tu wrote:
> Hi Ilya,
> 
> Thanks for the feedback.
> 
> On Wed, Sep 4, 2019 at 5:19 AM Ilya Maximets <i.maximets@samsung.com> wrote:
>>
>> Hi William,
>>
>> Thanks for the patch!
>> One general comment is that we, probably, should make this feature
>> configurable.  There are 2 reasons:
>>
>> 1. We might want to run OVS on older kernel while building with more
>>    recent libbpf. In this case socket creation will fail due to
>>    unsupported flag and we'll not be able to make it work.
> 
> I think it will also work, it falls back to old behavior.
> From:
> commit 77cd0d7b3f257fd0e3096b4fdcff1a7d38e99e10
> Author: Magnus Karlsson <magnus.karlsson@intel.com>
> Date:   Wed Aug 14 09:27:17 2019 +0200
> "
>    This flag needs some simple driver support. If the driver does not
>     support this, the Rx flag is always zero and the Tx flag is always
>     one. This makes any application relying on this feature default to the
>     old behaviour of not requiring any syscalls in the Rx path and always
>     having to call sendto() in the Tx path.
> "

This part is about relation between xdp subsystem and the device driver.
If device driver doesn't support this feature, xdp subsystem will handle
this by always exposing need_wakeup flag on rings.
However, if bind_flags contains XDP_USE_NEED_WAKEUP, but xdp subsystem
doesn't support it, userspace/libbpf will get failure on socket creation.
To be honest, I didn't test that, but I think that it works this way.
You may try building OVS with libbpf from bpf-next and run it on master
kernel (without need_wakeup support).

> 
>> 2. Performance impact is not always positive. It depends on the
>>    number of available CPU cores, port types (phisical or virtual),
>>    interrupts affinity.  And this was proved by test results from
>>    Eelco.  So, it might be good to have control over the enabling/
>>    disabling the feature.
>>
>> However, I think that it's better to keep it enabled by default.
> 
> OK, I will make it enabled by default, and add another option.
> "options:need_wakeup=false" for user to disable it.

This might be better to call it 'options:use_need_wakeup' because
'need_wakeup=false' sounds like we don't need to ever wake it up,
but it's opposite in practice.

> 
>>
>> Some comments inline.
>>
>> Best regards, Ilya Maximets.
>>
>> On 27.08.2019 2:02, William Tu wrote:
>>> The patch adds support for using need_wakeup flag in AF_XDP rings.
>>> When this flag is used, it means that OVS has to explicitly wake
>>> up the kernel RX, using poll() syscall and wake up TX, using sendto()
>>> syscall. This feature improves the performance by avoiding unnecessary
>>> syscalls, so keeping more CPU time in userspace to process packets.
>>>
>>> On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
>>> to physical port improves from 6.1Mpps to 7.3Mpps.
>>>
>>> Suggested-by: Eelco Chaudron <echaudro@redhat.com>
>>
>> Wasn't it me? :) Just curious.
> 
> Went back to check the previous email, and it was you not Eelco.
> Sorry for the mistake!

It's OK.

> 
>>
>>> Signed-off-by: William Tu <u9012063@gmail.com>
>>> ---
>>>  acinclude.m4       |  5 +++++
>>>  lib/netdev-afxdp.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 56 insertions(+)
>>>
>>> diff --git a/acinclude.m4 b/acinclude.m4
>>> index 116ffcf9096d..8821b821ec3c 100644
>>> --- a/acinclude.m4
>>> +++ b/acinclude.m4
>>> @@ -276,6 +276,11 @@ AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
>>>                [Define to 1 if AF_XDP support is available and enabled.])
>>>      LIBBPF_LDADD=" -lbpf -lelf"
>>>      AC_SUBST([LIBBPF_LDADD])
>>> +
>>> +    AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
>>> +      AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1],
>>> +        [XDP need wakeup support detected in xsk.h.])
>>> +    ], [], [#include <bpf/xsk.h>])
>>>    fi
>>>    AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
>>>  ])
>>> diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
>>> index e5b058d08a09..d14d100e8fa3 100644
>>> --- a/lib/netdev-afxdp.c
>>> +++ b/lib/netdev-afxdp.c
>>> @@ -26,6 +26,7 @@
>>>  #include <linux/rtnetlink.h>
>>>  #include <linux/if_xdp.h>
>>>  #include <net/if.h>
>>> +#include <poll.h>
>>>  #include <stdlib.h>
>>>  #include <sys/resource.h>
>>>  #include <sys/socket.h>
>>> @@ -117,6 +118,48 @@ struct xsk_socket_info {
>>>      atomic_uint64_t tx_dropped;
>>>  };
>>>
>>> +#ifdef HAVE_XDP_NEED_WAKEUP
>>> +static inline void
>>> +xsk_rx_need_wakeup(struct xsk_umem_info *umem,
>>
>> Function name is misleading, because it actually tries to wake
>> rx up, not just checking. Something like 'xsk_rx_wakeup' or
>> 'xsk_rx_wakeup_if_needed' might be more suitable.
>> Naming suggestions are welcome.
> 
> OK, thanks
>>
>>> +                   struct netdev *netdev, int fd) {
>>> +    struct pollfd pfd;
>>> +    int ret;
>>> +
>>> +    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
>>> +        pfd.fd = fd;
>>> +        pfd.events = POLLIN;
>>> +
>>> +        ret = poll(&pfd, 1, 1000);
>>
>> Agree with Eelco that we shouldn't pass 1000 there. Looks dangerous.
> yes
> 
>>
>>> +        if (OVS_UNLIKELY(ret == 0)) {
>>> +            VLOG_WARN_RL(&rl, "%s: poll rx fd timeout.",
>>> +                    netdev_get_name(netdev));
>>
>> Indents are off here and below. Please, align arguments to the next
>> symbol after '('.
>>
>>> +        } else if (OVS_UNLIKELY(ret < 0)) {
>>> +            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
>>> +                    netdev_get_name(netdev),
>>> +                    ovs_strerror(ret));
>>
>> 'ret' is always -1 on failure. errno should be there.
> 
> OK will fix the above two.
> 
> William
> 
>
William Tu Sept. 4, 2019, 6:23 p.m. UTC | #7
On Wed, Sep 4, 2019 at 7:27 AM Ilya Maximets <i.maximets@samsung.com> wrote:
>
> On 04.09.2019 17:04, William Tu wrote:
> > Hi Ilya,
> >
> > Thanks for the feedback.
> >
> > On Wed, Sep 4, 2019 at 5:19 AM Ilya Maximets <i.maximets@samsung.com> wrote:
> >>
> >> Hi William,
> >>
> >> Thanks for the patch!
> >> One general comment is that we, probably, should make this feature
> >> configurable.  There are 2 reasons:
> >>
> >> 1. We might want to run OVS on older kernel while building with more
> >>    recent libbpf. In this case socket creation will fail due to
> >>    unsupported flag and we'll not be able to make it work.
> >
> > I think it will also work, it falls back to old behavior.
> > From:
> > commit 77cd0d7b3f257fd0e3096b4fdcff1a7d38e99e10
> > Author: Magnus Karlsson <magnus.karlsson@intel.com>
> > Date:   Wed Aug 14 09:27:17 2019 +0200
> > "
> >    This flag needs some simple driver support. If the driver does not
> >     support this, the Rx flag is always zero and the Tx flag is always
> >     one. This makes any application relying on this feature default to the
> >     old behaviour of not requiring any syscalls in the Rx path and always
> >     having to call sendto() in the Tx path.
> > "
>
> This part is about relation between xdp subsystem and the device driver.
> If device driver doesn't support this feature, xdp subsystem will handle
> this by always exposing need_wakeup flag on rings.
> However, if bind_flags contains XDP_USE_NEED_WAKEUP, but xdp subsystem
> doesn't support it, userspace/libbpf will get failure on socket creation.
> To be honest, I didn't test that, but I think that it works this way.
> You may try building OVS with libbpf from bpf-next and run it on master
> kernel (without need_wakeup support).

Yes, you're right, it will fail at xsk_bind when checking
    if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY) )

>
> >
> >> 2. Performance impact is not always positive. It depends on the
> >>    number of available CPU cores, port types (phisical or virtual),
> >>    interrupts affinity.  And this was proved by test results from
> >>    Eelco.  So, it might be good to have control over the enabling/
> >>    disabling the feature.
> >>
> >> However, I think that it's better to keep it enabled by default.
> >
> > OK, I will make it enabled by default, and add another option.
> > "options:need_wakeup=false" for user to disable it.
>
> This might be better to call it 'options:use_need_wakeup' because
> 'need_wakeup=false' sounds like we don't need to ever wake it up,
> but it's opposite in practice.

OK
Thanks
William
William Tu Sept. 5, 2019, 4:21 p.m. UTC | #8
> Did some more testing and with PVP I see a performance decrease, with
> physical to physical I see an increase.
> Tests are performed with a port redirect open flow rule on an ixgbe
> (Xeon E5-2690 v4 2.60GHz):
>
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> |  PVP      | Number of flows |   64    |   128   |   256   |   512   |
>   768   |   1024  |  1514  |
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> | master    |            1000 |  737896 |  700643 |  682915 |  648386 |
> 621792 |  582821 | 527899 |
> | Patch     |            1000 |  734179 |  696515 |  676712 |  646536 |
> 619600 |  578546 | 519965 |
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> | Port2Port | Number of flows |   64    |   128   |   256   |   512   |
>   768   |  1024   |  1514  |
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> | master    |            1000 | 3351114 | 3236581 | 3143710 | 2349598 |
> 1586276 | 1197304 | 814854 |
> | Patch     |            1000 | 3571733 | 3488697 | 3448908 | 2349593 |
> 1586276 | 1197304 | 814854 |
> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+

Hi Eleco,

I'm wondering why you get only ~3.3Mpps for P2P test. Using ixgbe, I
usually get at
least 6Mpps using single flow (a single 64B UDP packet hitting single
OpenFlow rule)
Do you think it's due to have 1000 flows in your setup?

Another possible overhead is due to no rxhash in AF_XDP, so there is extra
overhead doing flow_hash_5tuple and dp_packet_set_rss_hash

Regards,
William
William Tu Sept. 5, 2019, 9:31 p.m. UTC | #9
On Wed, Sep 4, 2019 at 7:10 AM Ilya Maximets <i.maximets@samsung.com> wrote:
>
> > Hi Eelco,
> >
> > Thanks for your testing and review.
> >
> > On Wed, Sep 4, 2019 at 1:04 AM Eelco Chaudron <echaudro at redhat.com> wrote:
> >>
> >>
> >>
> >> On 27 Aug 2019, at 1:02, William Tu wrote:
> >>
> >> > The patch adds support for using need_wakeup flag in AF_XDP rings.
> >> > When this flag is used, it means that OVS has to explicitly wake
> >> > up the kernel RX, using poll() syscall and wake up TX, using sendto()
> >> > syscall. This feature improves the performance by avoiding unnecessary
> >> > syscalls, so keeping more CPU time in userspace to process packets.
> >> >
> >> > On Intel Xeon E5-2620 v3 2.4GHz system, performance of physical port
> >> > to physical port improves from 6.1Mpps to 7.3Mpps.
> >>
> >> Did some more testing and with PVP I see a performance decrease, with
> >> physical to physical I see an increase.
> >> Tests are performed with a port redirect open flow rule on an ixgbe
> >> (Xeon E5-2690 v4 2.60GHz):
> >>
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >> |  PVP      | Number of flows |   64    |   128   |   256   |   512   |
> >>   768   |   1024  |  1514  |
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >> | master    |            1000 |  737896 |  700643 |  682915 |  648386 |
> >> 621792 |  582821 | 527899 |
> >> | Patch     |            1000 |  734179 |  696515 |  676712 |  646536 |
> >> 619600 |  578546 | 519965 |
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >>
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >> | Port2Port | Number of flows |   64    |   128   |   256   |   512   |
> >>   768   |  1024   |  1514  |
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >> | master    |            1000 | 3351114 | 3236581 | 3143710 | 2349598 |
> >> 1586276 | 1197304 | 814854 |
> >> | Patch     |            1000 | 3571733 | 3488697 | 3448908 | 2349593 |
> >> 1586276 | 1197304 | 814854 |
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >>
> >> Did not research why PVP is slower, maybe related to the TAP interface
> >> with AF_XDP?
> >>
> > I haven't tried PVP with this feature.
> > Maybe for virtual device, we don't need "need_wakeup" feature.
> > Let me investigate more.
>
> For virtual devices xmit is synchronous, so it always needs wakeup, i.e.
> flag is always set.  In my spare time I'm working on kernel thread to
> poll tx queue in SKB mode by the analogue with SQ_POLL in io_uring, but
> I'm not sure if it will have good performance impact.
>
Cool.
If there is a kthread polling the SKB mode tx queue, then need_wakeup feature
can be enabled for virtual device. This will save sendto syscall overhead and
improve performance.

William
Eelco Chaudron Sept. 6, 2019, 8:13 a.m. UTC | #10
On 5 Sep 2019, at 18:21, William Tu wrote:

>> Did some more testing and with PVP I see a performance decrease, with
>> physical to physical I see an increase.
>> Tests are performed with a port redirect open flow rule on an ixgbe
>> (Xeon E5-2690 v4 2.60GHz):
>>
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>> |  PVP      | Number of flows |   64    |   128   |   256   |   512   
>> |
>>   768   |   1024  |  1514  |
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>> | master    |            1000 |  737896 |  700643 |  682915 |  648386 
>> |
>> 621792 |  582821 | 527899 |
>> | Patch     |            1000 |  734179 |  696515 |  676712 |  646536 
>> |
>> 619600 |  578546 | 519965 |
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>>
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>> | Port2Port | Number of flows |   64    |   128   |   256   |   512   
>> |
>>   768   |  1024   |  1514  |
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>> | master    |            1000 | 3351114 | 3236581 | 3143710 | 2349598 
>> |
>> 1586276 | 1197304 | 814854 |
>> | Patch     |            1000 | 3571733 | 3488697 | 3448908 | 2349593 
>> |
>> 1586276 | 1197304 | 814854 |
>> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
>
> Hi Eleco,
>
> I'm wondering why you get only ~3.3Mpps for P2P test. Using ixgbe, I
> usually get at
> least 6Mpps using single flow (a single 64B UDP packet hitting single
> OpenFlow rule)
> Do you think it's due to have 1000 flows in your setup?

I did a quick test and the number gets a bit better with fewer flows:

"Physical loopback test, L3 flows[port redirect]"
,Packet size
Number of flows,64
1,4111768
100,3904664
1000,3573564

I’m using a single of rule “in_port=eno1 actions=IN_PORT”, with a 
single card, i.e. the same card in and out.

> Another possible overhead is due to no rxhash in AF_XDP, so there is 
> extra
> overhead doing flow_hash_5tuple and dp_packet_set_rss_hash

But this should not influence the difference between our tests. Do you 
use a single card/port in and out? I’m using a single queue and single 
PMD thread.

> Regards,
> William
William Tu Sept. 6, 2019, 4:16 p.m. UTC | #11
On Fri, Sep 6, 2019 at 1:14 AM Eelco Chaudron <echaudro@redhat.com> wrote:
>
>
>
> On 5 Sep 2019, at 18:21, William Tu wrote:
>
> >> Did some more testing and with PVP I see a performance decrease, with
> >> physical to physical I see an increase.
> >> Tests are performed with a port redirect open flow rule on an ixgbe
> >> (Xeon E5-2690 v4 2.60GHz):
> >>
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >> |  PVP      | Number of flows |   64    |   128   |   256   |   512
> >> |
> >>   768   |   1024  |  1514  |
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >> | master    |            1000 |  737896 |  700643 |  682915 |  648386
> >> |
> >> 621792 |  582821 | 527899 |
> >> | Patch     |            1000 |  734179 |  696515 |  676712 |  646536
> >> |
> >> 619600 |  578546 | 519965 |
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >>
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >> | Port2Port | Number of flows |   64    |   128   |   256   |   512
> >> |
> >>   768   |  1024   |  1514  |
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >> | master    |            1000 | 3351114 | 3236581 | 3143710 | 2349598
> >> |
> >> 1586276 | 1197304 | 814854 |
> >> | Patch     |            1000 | 3571733 | 3488697 | 3448908 | 2349593
> >> |
> >> 1586276 | 1197304 | 814854 |
> >> +-----------+-----------------+---------+---------+---------+---------+---------+---------+--------+
> >
> > Hi Eleco,
> >
> > I'm wondering why you get only ~3.3Mpps for P2P test. Using ixgbe, I
> > usually get at
> > least 6Mpps using single flow (a single 64B UDP packet hitting single
> > OpenFlow rule)
> > Do you think it's due to have 1000 flows in your setup?
>
> I did a quick test and the number gets a bit better with fewer flows:
>
> "Physical loopback test, L3 flows[port redirect]"
> ,Packet size
> Number of flows,64
> 1,4111768
> 100,3904664
> 1000,3573564
>
> I’m using a single of rule “in_port=eno1 actions=IN_PORT”, with a
> single card, i.e. the same card in and out.
>
> > Another possible overhead is due to no rxhash in AF_XDP, so there is
> > extra
> > overhead doing flow_hash_5tuple and dp_packet_set_rss_hash
>
> But this should not influence the difference between our tests. Do you
> use a single card/port in and out? I’m using a single queue and single
> PMD thread.

I use single card with dual port (ex: eth2 and eth3), and single queue with
single PMD for each netdev.

BTW, the v2 patch about need_wakeup (I forgot to CC you)...
https://patchwork.ozlabs.org/patch/1158678/

Thanks
William

Patch
diff mbox series

diff --git a/acinclude.m4 b/acinclude.m4
index 116ffcf9096d..8821b821ec3c 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -276,6 +276,11 @@  AC_DEFUN([OVS_CHECK_LINUX_AF_XDP], [
               [Define to 1 if AF_XDP support is available and enabled.])
     LIBBPF_LDADD=" -lbpf -lelf"
     AC_SUBST([LIBBPF_LDADD])
+
+    AC_CHECK_DECL([xsk_ring_prod__needs_wakeup], [
+      AC_DEFINE([HAVE_XDP_NEED_WAKEUP], [1],
+        [XDP need wakeup support detected in xsk.h.])
+    ], [], [#include <bpf/xsk.h>])
   fi
   AM_CONDITIONAL([HAVE_AF_XDP], test "$AF_XDP_ENABLE" = true)
 ])
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
index e5b058d08a09..d14d100e8fa3 100644
--- a/lib/netdev-afxdp.c
+++ b/lib/netdev-afxdp.c
@@ -26,6 +26,7 @@ 
 #include <linux/rtnetlink.h>
 #include <linux/if_xdp.h>
 #include <net/if.h>
+#include <poll.h>
 #include <stdlib.h>
 #include <sys/resource.h>
 #include <sys/socket.h>
@@ -117,6 +118,48 @@  struct xsk_socket_info {
     atomic_uint64_t tx_dropped;
 };
 
+#ifdef HAVE_XDP_NEED_WAKEUP
+static inline void
+xsk_rx_need_wakeup(struct xsk_umem_info *umem,
+                   struct netdev *netdev, int fd) {
+    struct pollfd pfd;
+    int ret;
+
+    if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
+        pfd.fd = fd;
+        pfd.events = POLLIN;
+
+        ret = poll(&pfd, 1, 1000);
+        if (OVS_UNLIKELY(ret == 0)) {
+            VLOG_WARN_RL(&rl, "%s: poll rx fd timeout.",
+                    netdev_get_name(netdev));
+        } else if (OVS_UNLIKELY(ret < 0)) {
+            VLOG_WARN_RL(&rl, "%s: error polling rx fd: %s.",
+                    netdev_get_name(netdev),
+                    ovs_strerror(ret));
+        }
+    }
+}
+
+static inline bool
+xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info)
+{
+    return xsk_ring_prod__needs_wakeup(&xsk_info->tx);
+}
+#else
+static inline void
+xsk_rx_need_wakeup(struct xsk_umem_info *umem OVS_UNUSED,
+                   struct netdev *netdev OVS_UNUSED, int fd OVS_UNUSED) {
+    /* Nothing. */
+}
+
+static inline bool
+xsk_tx_need_wakeup(struct xsk_socket_info *xsk_info OVS_UNUSED)
+{
+    return true;
+}
+#endif /* HAVE_XDP_NEED_WAKEUP */
+
 static void
 netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
 {
@@ -257,6 +300,10 @@  xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
         cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE;
     }
 
+#ifdef HAVE_XDP_NEED_WAKEUP
+    cfg.bind_flags |= XDP_USE_NEED_WAKEUP;
+#endif
+
     if (if_indextoname(ifindex, devname) == NULL) {
         VLOG_ERR("ifindex %d to devname failed (%s)",
                  ifindex, ovs_strerror(errno));
@@ -660,6 +707,7 @@  netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
 
     rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
     if (!rcvd) {
+        xsk_rx_need_wakeup(umem, netdev, rx->fd);
         return EAGAIN;
     }
 
@@ -709,6 +757,9 @@  kick_tx(struct xsk_socket_info *xsk_info, int xdpmode)
     int ret, retries;
     static const int KERNEL_TX_BATCH_SIZE = 16;
 
+    if (!xsk_tx_need_wakeup(xsk_info)) {
+        return 0;
+    }
     /* In SKB_MODE packet transmission is synchronous, and the kernel xmits
      * only TX_BATCH_SIZE(16) packets for a single sendmsg syscall.
      * So, we have to kick the kernel (n_packets / 16) times to be sure that