[bpf-next,2/4] xsk: allow AF_XDP sockets to receive packets directly from a queue

Message ID 1570515415-45593-3-git-send-email-sridhar.samudrala@intel.com
State Awaiting Upstream
Series Enable direct receive on AF_XDP sockets

Commit Message

Samudrala, Sridhar Oct. 8, 2019, 6:16 a.m. UTC
Introduce a flag that can be specified during the bind() call
of an AF_XDP socket to receive packets directly from a queue when there is
no XDP program attached to the device.

This is enabled by introducing a special BPF prog pointer called
BPF_PROG_DIRECT_XSK and a new bind flag XDP_DIRECT that can be specified
when binding an AF_XDP socket to a queue. At bind() time, an AF_XDP
socket in XDP_DIRECT mode attaches BPF_PROG_DIRECT_XSK as the XDP program
if no other XDP program is attached to the device. The device receive
queue is also associated with the AF_XDP socket.

In the XDP receive path, if the attached program is a DIRECT_XSK program,
the XDP buffer is passed directly to the AF_XDP socket associated with the
receive queue on which the packet arrived.

To avoid any overhead for normal XDP programs, a static key tracks the
number of AF_XDP direct sockets, and static_branch_unlikely() is used to
gate the direct-socket receive path.

Any attach of a normal XDP program takes precedence, and the direct xsk
program is removed. The direct XSK program is reattached automatically
when the normal XDP program is removed, provided there are still AF_XDP
direct sockets associated with that device.
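
For illustration, a minimal userspace sketch of binding a socket with the
new flag (UMEM registration and ring setup, which bind() requires, are
omitted; ifindex, queue_id and error handling are placeholders):

	int fd = socket(AF_XDP, SOCK_RAW, 0);
	struct sockaddr_xdp sxdp = {
		.sxdp_family = AF_XDP,
		.sxdp_ifindex = ifindex,	/* device to bind to */
		.sxdp_queue_id = queue_id,	/* HW queue feeding this socket */
		.sxdp_flags = XDP_DIRECT,	/* new flag from this patch */
	};

	if (bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp)) < 0)
		perror("bind");	/* e.g. EINVAL when combined with XDP_SHARED_UMEM */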

Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
---
 include/linux/filter.h            | 18 ++++++++++++
 include/linux/netdevice.h         | 10 +++++++
 include/net/xdp_sock.h            |  5 ++++
 include/uapi/linux/if_xdp.h       |  5 ++++
 kernel/bpf/syscall.c              |  7 +++--
 net/core/dev.c                    | 50 +++++++++++++++++++++++++++++++++
 net/core/filter.c                 | 58 +++++++++++++++++++++++++++++++++++++++
 net/xdp/xsk.c                     | 51 ++++++++++++++++++++++++++++++++--
 tools/include/uapi/linux/if_xdp.h |  5 ++++
 9 files changed, 204 insertions(+), 5 deletions(-)

Comments

Toke Høiland-Jørgensen Oct. 8, 2019, 6:58 a.m. UTC | #1
Sridhar Samudrala <sridhar.samudrala@intel.com> writes:

>  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
>  		    struct bpf_prog *xdp_prog)
>  {
>  	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>  	struct bpf_map *map = READ_ONCE(ri->map);
> +	struct xdp_sock *xsk;
> +
> +	xsk = xdp_get_direct_xsk(ri);
> +	if (xsk)
> +		return xsk_rcv(xsk, xdp);

This is a new branch and a read barrier in the XDP_REDIRECT fast path.
What's the performance impact of that for non-XSK redirect?

-Toke
Björn Töpel Oct. 8, 2019, 8:05 a.m. UTC | #2
On 2019-10-08 08:16, Sridhar Samudrala wrote:
> Introduce a flag that can be specified during the bind() call
> of an AF_XDP socket to receive packets directly from a queue when there is
> no XDP program attached to the device.
> 
> This is enabled by introducing a special BPF prog pointer called
> BPF_PROG_DIRECT_XSK and a new bind flag XDP_DIRECT that can be specified
> when binding an AF_XDP socket to a queue. At bind() time, an AF_XDP
> socket in XDP_DIRECT mode attaches BPF_PROG_DIRECT_XSK as the XDP program
> if no other XDP program is attached to the device. The device receive
> queue is also associated with the AF_XDP socket.
>
> In the XDP receive path, if the attached program is a DIRECT_XSK program,
> the XDP buffer is passed directly to the AF_XDP socket associated with the
> receive queue on which the packet arrived.
>
> To avoid any overhead for normal XDP programs, a static key tracks the
> number of AF_XDP direct sockets, and static_branch_unlikely() is used to
> gate the direct-socket receive path.
>
> Any attach of a normal XDP program takes precedence, and the direct xsk
> program is removed. The direct XSK program is reattached automatically
> when the normal XDP program is removed, provided there are still AF_XDP
> direct sockets associated with that device.
> 
> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
> ---
>   include/linux/filter.h            | 18 ++++++++++++
>   include/linux/netdevice.h         | 10 +++++++
>   include/net/xdp_sock.h            |  5 ++++
>   include/uapi/linux/if_xdp.h       |  5 ++++
>   kernel/bpf/syscall.c              |  7 +++--
>   net/core/dev.c                    | 50 +++++++++++++++++++++++++++++++++
>   net/core/filter.c                 | 58 +++++++++++++++++++++++++++++++++++++++
>   net/xdp/xsk.c                     | 51 ++++++++++++++++++++++++++++++++--
>   tools/include/uapi/linux/if_xdp.h |  5 ++++
>   9 files changed, 204 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 2ce57645f3cd..db4ad85d8321 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -585,6 +585,9 @@ struct bpf_redirect_info {
>   	struct bpf_map *map;
>   	struct bpf_map *map_to_flush;
>   	u32 kern_flags;
> +#ifdef CONFIG_XDP_SOCKETS
> +	struct xdp_sock *xsk;
> +#endif
>   };
>   
>   DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
> @@ -693,6 +696,16 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
>   	return res;
>   }
>   
> +#ifdef CONFIG_XDP_SOCKETS
> +#define BPF_PROG_DIRECT_XSK	0x1
> +#define bpf_is_direct_xsk_prog(prog) \
> +	((unsigned long)prog == BPF_PROG_DIRECT_XSK)
> +DECLARE_STATIC_KEY_FALSE(xdp_direct_xsk_needed);
> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp);
> +#else
> +#define bpf_is_direct_xsk_prog(prog) (false)
> +#endif
> +
>   static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
>   					    struct xdp_buff *xdp)
>   {
> @@ -702,6 +715,11 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
>   	 * already takes rcu_read_lock() when fetching the program, so
>   	 * it's not necessary here anymore.
>   	 */
> +#ifdef CONFIG_XDP_SOCKETS
> +	if (static_branch_unlikely(&xdp_direct_xsk_needed) &&
> +	    bpf_is_direct_xsk_prog(prog))
> +		return bpf_direct_xsk(prog, xdp);
> +#endif
>   	return BPF_PROG_RUN(prog, xdp);
>   }
>   
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 48cc71aae466..f4d0f70aa718 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -743,6 +743,7 @@ struct netdev_rx_queue {
>   	struct xdp_rxq_info		xdp_rxq;
>   #ifdef CONFIG_XDP_SOCKETS
>   	struct xdp_umem                 *umem;
> +	struct xdp_sock			*xsk;
>   #endif
>   } ____cacheline_aligned_in_smp;
>   
> @@ -1836,6 +1837,10 @@ struct net_device {
>   	atomic_t		carrier_up_count;
>   	atomic_t		carrier_down_count;
>   
> +#ifdef CONFIG_XDP_SOCKETS
> +	u16			direct_xsk_count;

Why u16? num_rx/tx_queues are unsigned ints.

> +#endif
> +
>   #ifdef CONFIG_WIRELESS_EXT
>   	const struct iw_handler_def *wireless_handlers;
>   	struct iw_public_data	*wireless_data;
> @@ -3712,6 +3717,11 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
>   bool is_skb_forwardable(const struct net_device *dev,
>   			const struct sk_buff *skb);
>   
> +#ifdef CONFIG_XDP_SOCKETS
> +int dev_set_direct_xsk_prog(struct net_device *dev);
> +int dev_clear_direct_xsk_prog(struct net_device *dev);
> +#endif
> +
>   static __always_inline int ____dev_forward_skb(struct net_device *dev,
>   					       struct sk_buff *skb)
>   {
> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
> index c9398ce7960f..9158233d34e1 100644
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
> @@ -76,6 +76,9 @@ struct xsk_map_node {
>   	struct xdp_sock **map_entry;
>   };
>   
> +/* Flags for the xdp_sock flags field. */
> +#define XDP_SOCK_DIRECT (1 << 0)
> +
>   struct xdp_sock {
>   	/* struct sock must be the first member of struct xdp_sock */
>   	struct sock sk;
> @@ -104,6 +107,7 @@ struct xdp_sock {
>   	struct list_head map_list;
>   	/* Protects map_list */
>   	spinlock_t map_list_lock;
> +	u16 flags;

Right now only XDP_DIRECT is tracked here. Maybe track all the flags,
and show them in xsk_diag.

>   };
>   
>   struct xdp_buff;
> @@ -129,6 +133,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
>   void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
>   void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
>   bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
> +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id);
>   
>   void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
>   			     struct xdp_sock **map_entry);
> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
> index be328c59389d..d202b5d40859 100644
> --- a/include/uapi/linux/if_xdp.h
> +++ b/include/uapi/linux/if_xdp.h
> @@ -25,6 +25,11 @@
>    * application.
>    */
>   #define XDP_USE_NEED_WAKEUP (1 << 3)
> +/* This option allows an AF_XDP socket bound to a queue to receive all
> + * the packets directly from that queue when there is no XDP program
> + * attached to the device.
> + */
> +#define XDP_DIRECT	(1 << 4)
>   
>   /* Flags for xsk_umem_config flags */
>   #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 205f95af67d2..871d738a78d2 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -1349,13 +1349,14 @@ static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
>   
>   void bpf_prog_put(struct bpf_prog *prog)
>   {
> -	__bpf_prog_put(prog, true);
> +	if (!bpf_is_direct_xsk_prog(prog))
> +		__bpf_prog_put(prog, true);
>   }
>   EXPORT_SYMBOL_GPL(bpf_prog_put);
>   
>   u32 bpf_get_prog_id(const struct bpf_prog *prog)
>   {
> -	if (prog)
> +	if (prog && !bpf_is_direct_xsk_prog(prog))
>   		return prog->aux->id;
>   
>   	return 0;
> @@ -1364,7 +1365,7 @@ EXPORT_SYMBOL(bpf_get_prog_id);
>   
>   void bpf_set_prog_id(struct bpf_prog *prog, u32 id)
>   {
> -	if (prog)
> +	if (prog && !bpf_is_direct_xsk_prog(prog))
>   		prog->aux->id = id;
>   }
>   EXPORT_SYMBOL(bpf_set_prog_id);
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 866d0ad936a5..eb3cd718e580 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -8269,6 +8269,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
>   	} else {
>   		if (!__dev_xdp_query(dev, bpf_op, query))
>   			return 0;
> +#ifdef CONFIG_XDP_SOCKETS
> +		if (dev->direct_xsk_count)
> +			prog = (void *)BPF_PROG_DIRECT_XSK;
> +#endif

Nit, but maybe hide this weirdness in a function?
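
Something like this, perhaps (a sketch; the helper name is invented here):

static struct bpf_prog *dev_direct_xsk_prog(struct net_device *dev)
{
#ifdef CONFIG_XDP_SOCKETS
	/* Keep the sentinel prog installed while direct sockets remain */
	if (dev->direct_xsk_count)
		return (void *)BPF_PROG_DIRECT_XSK;
#endif
	return NULL;
}

so the detach path above becomes just "prog = dev_direct_xsk_prog(dev);".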

>   	}
>   
>   	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
> @@ -8278,6 +8282,52 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
>   	return err;
>   }
>   
> +#ifdef CONFIG_XDP_SOCKETS
> +int dev_set_direct_xsk_prog(struct net_device *dev)
> +{
> +	const struct net_device_ops *ops = dev->netdev_ops;
> +	struct bpf_prog *prog;
> +	bpf_op_t bpf_op;
> +
> +	ASSERT_RTNL();
> +
> +	dev->direct_xsk_count++;
> +
> +	bpf_op = ops->ndo_bpf;
> +	if (!bpf_op)
> +		return -EOPNOTSUPP;
> +
> +	if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
> +		return 0;
> +
> +	prog = (void *)BPF_PROG_DIRECT_XSK;
> +
> +	return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, prog);
> +}
> +
> +int dev_clear_direct_xsk_prog(struct net_device *dev)
> +{
> +	const struct net_device_ops *ops = dev->netdev_ops;
> +	bpf_op_t bpf_op;
> +
> +	ASSERT_RTNL();
> +
> +	dev->direct_xsk_count--;
> +
> +	if (dev->direct_xsk_count)
> +		return 0;
> +
> +	bpf_op = ops->ndo_bpf;
> +	if (!bpf_op)
> +		return -EOPNOTSUPP;
> +
> +	if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
> +		return 0;
> +
> +	return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, NULL);
> +}
> +#endif
> +
>   /**
>    *	dev_new_index	-	allocate an ifindex
>    *	@net: the applicable net namespace
> diff --git a/net/core/filter.c b/net/core/filter.c
> index ed6563622ce3..391d7d600284 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -73,6 +73,7 @@
>   #include <net/lwtunnel.h>
>   #include <net/ipv6_stubs.h>
>   #include <net/bpf_sk_storage.h>
> +#include <linux/static_key.h>
>   
>   /**
>    *	sk_filter_trim_cap - run a packet through a socket filter
> @@ -3546,6 +3547,22 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
>   	return 0;
>   }
>   
> +#ifdef CONFIG_XDP_SOCKETS
> +static void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
> +{
> +	struct xdp_sock *xsk = READ_ONCE(ri->xsk);

Why READ_ONCE here?

> +
> +	if (xsk) {
> +		ri->xsk = NULL;
> +		xsk_flush(xsk);
> +	}
> +}
> +#else
> +static inline void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
> +{
> +}
> +#endif
> +

Move CONFIG_XDP_SOCKETS into the function, and remove the empty/bottom one.
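
I.e., roughly (a sketch of the suggested shape):

static void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
{
#ifdef CONFIG_XDP_SOCKETS
	struct xdp_sock *xsk = ri->xsk;

	if (xsk) {
		ri->xsk = NULL;
		xsk_flush(xsk);
	}
#endif
}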

>   void xdp_do_flush_map(void)
>   {
>   	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
> @@ -3568,6 +3585,8 @@ void xdp_do_flush_map(void)
>   			break;
>   		}
>   	}
> +
> +	xdp_do_flush_xsk(ri);
>   }
>   EXPORT_SYMBOL_GPL(xdp_do_flush_map);
>   
> @@ -3631,11 +3650,28 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
>   	return err;
>   }
>   
> +#ifdef CONFIG_XDP_SOCKETS
> +static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri)
> +{
> +	return READ_ONCE(ri->xsk);

Again, why READ_ONCE? Please leave the inlining to the compiler in .c files.

> +}
> +#else
> +static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri)
> +{
> +	return NULL;
> +}
> +#endif
> +
>   int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
>   		    struct bpf_prog *xdp_prog)
>   {
>   	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>   	struct bpf_map *map = READ_ONCE(ri->map);
> +	struct xdp_sock *xsk;
> +
> +	xsk = xdp_get_direct_xsk(ri);
> +	if (xsk)
> +		return xsk_rcv(xsk, xdp);

Hmm, maybe you need a xsk_to_flush as well. Say that a user swaps in a
regular XDP program, then xsk_rcv() will be called until the flush
occurs, right? IOW, all packets processed (napi budget) in the napi_poll
will end up in the socket.
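
A sketch of that idea, mirroring the existing ri->map/ri->map_to_flush
split (xsk_to_flush is a suggested field, not in this patch):

	/* bpf_direct_xsk(): record the socket for delivery and for the flush */
	ri->xsk = xsk;
	ri->xsk_to_flush = xsk;

	/* xdp_do_redirect(): consume and clear the per-packet pointer */
	xsk = ri->xsk;
	if (xsk) {
		ri->xsk = NULL;
		return xsk_rcv(xsk, xdp);
	}

	/* xdp_do_flush_map(): flush the socket packets actually went to */
	if (ri->xsk_to_flush) {
		xsk_flush(ri->xsk_to_flush);
		ri->xsk_to_flush = NULL;
	}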

>   
>   	if (likely(map))
>   		return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
> @@ -8934,4 +8970,26 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
>   
>   const struct bpf_prog_ops sk_reuseport_prog_ops = {
>   };
> +
> +#ifdef CONFIG_XDP_SOCKETS
> +DEFINE_STATIC_KEY_FALSE(xdp_direct_xsk_needed);
> +EXPORT_SYMBOL_GPL(xdp_direct_xsk_needed);
> +
> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp)
> +{
> +	struct xdp_sock *xsk;
> +
> +	xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index);
> +	if (xsk) {
> +		struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
> +
> +		ri->xsk = xsk;
> +		return XDP_REDIRECT;

From the comment above: I *think* you need an ri->xsk_to_flush. Can the
direct socket (swap socket) change before the flush?

> +	}
> +
> +	return XDP_PASS;
> +}
> +EXPORT_SYMBOL(bpf_direct_xsk);
> +#endif
> +
>   #endif /* CONFIG_INET */
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index fa8fbb8fa3c8..8a29939bac7e 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -24,6 +24,7 @@
>   #include <linux/rculist.h>
>   #include <net/xdp_sock.h>
>   #include <net/xdp.h>
> +#include <linux/if_link.h>
>   
>   #include "xsk_queue.h"
>   #include "xdp_umem.h"
> @@ -264,6 +265,45 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
>   	return err;
>   }
>   
> +static void xdp_clear_direct_xsk(struct xdp_sock *xsk)

Use xs, and not xsk for consistency.

> +{
> +	struct net_device *dev = xsk->dev;
> +	u32 qid = xsk->queue_id;
> +
> +	if (!dev->_rx[qid].xsk)
> +		return;
> +
> +	dev_clear_direct_xsk_prog(dev);
> +	dev->_rx[qid].xsk = NULL;
> +	static_branch_dec(&xdp_direct_xsk_needed);
> +	xsk->flags &= ~XDP_SOCK_DIRECT;
> +}
> +
> +static int xdp_set_direct_xsk(struct xdp_sock *xsk)

Same here.

> +{
> +	struct net_device *dev = xsk->dev;
> +	u32 qid = xsk->queue_id;
> +	int err = 0;
> +
> +	if (dev->_rx[qid].xsk)
> +		return -EEXIST;
> +
> +	xsk->flags |= XDP_SOCK_DIRECT;
> +	static_branch_inc(&xdp_direct_xsk_needed);
> +	dev->_rx[qid].xsk = xsk;
> +	err = dev_set_direct_xsk_prog(dev);
> +	if (err)
> +		xdp_clear_direct_xsk(xsk);
> +
> +	return err;
> +}
> +
> +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id)
> +{
> +	return dev->_rx[queue_id].xsk;
> +}
> +EXPORT_SYMBOL(xdp_get_xsk_from_qid);
> +
>   void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
>   {
>   	xskq_produce_flush_addr_n(umem->cq, nb_entries);
> @@ -464,6 +504,11 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
>   		return;
>   	WRITE_ONCE(xs->state, XSK_UNBOUND);
>   
> +	if (xs->flags & XDP_SOCK_DIRECT) {
> +		rtnl_lock();
> +		xdp_clear_direct_xsk(xs);
> +		rtnl_unlock();
> +	}
>   	/* Wait for driver to stop using the xdp socket. */
>   	xdp_del_sk_umem(xs->umem, xs);
>   	xs->dev = NULL;
> @@ -604,7 +649,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
>   
>   	flags = sxdp->sxdp_flags;
>   	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
> -		      XDP_USE_NEED_WAKEUP))
> +		      XDP_USE_NEED_WAKEUP | XDP_DIRECT))
>   		return -EINVAL;
>   
>   	rtnl_lock();
> @@ -632,7 +677,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
>   		struct socket *sock;
>   
>   		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
> -		    (flags & XDP_USE_NEED_WAKEUP)) {
> +		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_DIRECT)) {
>   			/* Cannot specify flags for shared sockets. */
>   			err = -EINVAL;
>   			goto out_unlock;
> @@ -688,6 +733,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
>   	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
>   	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
>   	xdp_add_sk_umem(xs->umem, xs);
> +	if (flags & XDP_DIRECT)
> +		err = xdp_set_direct_xsk(xs);
>   
>   out_unlock:
>   	if (err) {
> diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
> index be328c59389d..d202b5d40859 100644
> --- a/tools/include/uapi/linux/if_xdp.h
> +++ b/tools/include/uapi/linux/if_xdp.h
> @@ -25,6 +25,11 @@
>    * application.
>    */
>   #define XDP_USE_NEED_WAKEUP (1 << 3)
> +/* This option allows an AF_XDP socket bound to a queue to receive all
> + * the packets directly from that queue when there is no XDP program
> + * attached to the device.
> + */
> +#define XDP_DIRECT	(1 << 4)
>   
>   /* Flags for xsk_umem_config flags */
>   #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
>
Björn Töpel Oct. 8, 2019, 8:47 a.m. UTC | #3
On Tue, 8 Oct 2019 at 08:59, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Sridhar Samudrala <sridhar.samudrala@intel.com> writes:
>
> >  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
> >                   struct bpf_prog *xdp_prog)
> >  {
> >       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
> >       struct bpf_map *map = READ_ONCE(ri->map);
> > +     struct xdp_sock *xsk;
> > +
> > +     xsk = xdp_get_direct_xsk(ri);
> > +     if (xsk)
> > +             return xsk_rcv(xsk, xdp);
>
> This is a new branch and a read barrier in the XDP_REDIRECT fast path.
> What's the performance impact of that for non-XSK redirect?
>

The dependent-read-barrier in READ_ONCE? Another branch -- leave that
to the branch-predictor already! ;-) No, you're right, performance
impact here is interesting. I guess the same static_branch could be
used here as well...


> -Toke
>
Björn Töpel Oct. 8, 2019, 8:48 a.m. UTC | #4
On Tue, 8 Oct 2019 at 10:47, Björn Töpel <bjorn.topel@gmail.com> wrote:
>
[...]
>
> The dependent-read-barrier in READ_ONCE? Another branch -- leave that
> to the branch-predictor already! ;-) No, you're right, performance
> impact here is interesting. I guess the same static_branch could be
> used here as well...
>

...and I think the READ_ONCE can be omitted.
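
Putting the two suggestions together, the check in xdp_do_redirect()
could be gated like this (untested sketch):

#ifdef CONFIG_XDP_SOCKETS
	if (static_branch_unlikely(&xdp_direct_xsk_needed)) {
		struct xdp_sock *xsk = ri->xsk;	/* plain per-CPU load */

		if (xsk)
			return xsk_rcv(xsk, xdp);
	}
#endif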
Toke Høiland-Jørgensen Oct. 8, 2019, 9:04 a.m. UTC | #5
Björn Töpel <bjorn.topel@gmail.com> writes:

> On Tue, 8 Oct 2019 at 08:59, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Sridhar Samudrala <sridhar.samudrala@intel.com> writes:
>>
>> >  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
>> >                   struct bpf_prog *xdp_prog)
>> >  {
>> >       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>> >       struct bpf_map *map = READ_ONCE(ri->map);
>> > +     struct xdp_sock *xsk;
>> > +
>> > +     xsk = xdp_get_direct_xsk(ri);
>> > +     if (xsk)
>> > +             return xsk_rcv(xsk, xdp);
>>
>> This is a new branch and a read barrier in the XDP_REDIRECT fast path.
>> What's the performance impact of that for non-XSK redirect?
>>
>
> The dependent-read-barrier in READ_ONCE? Another branch -- leave that
> to the branch-predictor already! ;-) No, you're right, performance
> impact here is interesting. I guess the same static_branch could be
> used here as well...

In any case, some measurements to see the impact (or lack thereof) would
be useful :)

-Toke
Alexei Starovoitov Oct. 9, 2019, 1:20 a.m. UTC | #6
On Mon, Oct 7, 2019 at 11:18 PM Sridhar Samudrala
<sridhar.samudrala@intel.com> wrote:
> +
> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp)
> +{
> +       struct xdp_sock *xsk;
> +
> +       xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index);
> +       if (xsk) {
> +               struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
> +
> +               ri->xsk = xsk;
> +               return XDP_REDIRECT;
> +       }
> +
> +       return XDP_PASS;
> +}
> +EXPORT_SYMBOL(bpf_direct_xsk);

So you're saying there is a:
"""
xdpsock rxdrop 1 core (both app and queue's irq pinned to the same core)
   default : taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1
   direct-xsk :taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1
6.1x improvement in drop rate
"""

6.1x gain running above C code vs exactly equivalent BPF code?
How is that possible?
Samudrala, Sridhar Oct. 9, 2019, 4:32 p.m. UTC | #7
On 10/8/2019 1:05 AM, Björn Töpel wrote:
> On 2019-10-08 08:16, Sridhar Samudrala wrote:
>> Introduce a flag that can be specified during the bind() call
>> of an AF_XDP socket to receive packets directly from a queue when there is
>> no XDP program attached to the device.
>>
>> This is enabled by introducing a special BPF prog pointer called
>> BPF_PROG_DIRECT_XSK and a new bind flag XDP_DIRECT that can be specified
>> when binding an AF_XDP socket to a queue. At bind() time, an AF_XDP
>> socket in XDP_DIRECT mode attaches BPF_PROG_DIRECT_XSK as the XDP program
>> if no other XDP program is attached to the device. The device receive
>> queue is also associated with the AF_XDP socket.
>>
>> In the XDP receive path, if the attached program is a DIRECT_XSK program,
>> the XDP buffer is passed directly to the AF_XDP socket associated with the
>> receive queue on which the packet arrived.
>>
>> To avoid any overhead for normal XDP programs, a static key tracks the
>> number of AF_XDP direct sockets, and static_branch_unlikely() is used to
>> gate the direct-socket receive path.
>>
>> Any attach of a normal XDP program takes precedence, and the direct xsk
>> program is removed. The direct XSK program is reattached automatically
>> when the normal XDP program is removed, provided there are still AF_XDP
>> direct sockets associated with that device.
>>
>> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
>> ---
>>   include/linux/filter.h            | 18 ++++++++++++
>>   include/linux/netdevice.h         | 10 +++++++
>>   include/net/xdp_sock.h            |  5 ++++
>>   include/uapi/linux/if_xdp.h       |  5 ++++
>>   kernel/bpf/syscall.c              |  7 +++--
>>   net/core/dev.c                    | 50 
>> +++++++++++++++++++++++++++++++++
>>   net/core/filter.c                 | 58 
>> +++++++++++++++++++++++++++++++++++++++
>>   net/xdp/xsk.c                     | 51 
>> ++++++++++++++++++++++++++++++++--
>>   tools/include/uapi/linux/if_xdp.h |  5 ++++
>>   9 files changed, 204 insertions(+), 5 deletions(-)
>>
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 2ce57645f3cd..db4ad85d8321 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -585,6 +585,9 @@ struct bpf_redirect_info {
>>       struct bpf_map *map;
>>       struct bpf_map *map_to_flush;
>>       u32 kern_flags;
>> +#ifdef CONFIG_XDP_SOCKETS
>> +    struct xdp_sock *xsk;
>> +#endif
>>   };
>>   DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
>> @@ -693,6 +696,16 @@ static inline u32 bpf_prog_run_clear_cb(const 
>> struct bpf_prog *prog,
>>       return res;
>>   }
>> +#ifdef CONFIG_XDP_SOCKETS
>> +#define BPF_PROG_DIRECT_XSK    0x1
>> +#define bpf_is_direct_xsk_prog(prog) \
>> +    ((unsigned long)prog == BPF_PROG_DIRECT_XSK)
>> +DECLARE_STATIC_KEY_FALSE(xdp_direct_xsk_needed);
>> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp);
>> +#else
>> +#define bpf_is_direct_xsk_prog(prog) (false)
>> +#endif
>> +
>>   static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog 
>> *prog,
>>                           struct xdp_buff *xdp)
>>   {
>> @@ -702,6 +715,11 @@ static __always_inline u32 bpf_prog_run_xdp(const 
>> struct bpf_prog *prog,
>>        * already takes rcu_read_lock() when fetching the program, so
>>        * it's not necessary here anymore.
>>        */
>> +#ifdef CONFIG_XDP_SOCKETS
>> +    if (static_branch_unlikely(&xdp_direct_xsk_needed) &&
>> +        bpf_is_direct_xsk_prog(prog))
>> +        return bpf_direct_xsk(prog, xdp);
>> +#endif
>>       return BPF_PROG_RUN(prog, xdp);
>>   }
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 48cc71aae466..f4d0f70aa718 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -743,6 +743,7 @@ struct netdev_rx_queue {
>>       struct xdp_rxq_info        xdp_rxq;
>>   #ifdef CONFIG_XDP_SOCKETS
>>       struct xdp_umem                 *umem;
>> +    struct xdp_sock            *xsk;
>>   #endif
>>   } ____cacheline_aligned_in_smp;
>> @@ -1836,6 +1837,10 @@ struct net_device {
>>       atomic_t        carrier_up_count;
>>       atomic_t        carrier_down_count;
>> +#ifdef CONFIG_XDP_SOCKETS
>> +    u16            direct_xsk_count;
> 
> Why u16? num_rx/tx_queues are unsigned ints.

OK. Will change to unsigned int.

> 
>> +#endif
>> +
>>   #ifdef CONFIG_WIRELESS_EXT
>>       const struct iw_handler_def *wireless_handlers;
>>       struct iw_public_data    *wireless_data;
>> @@ -3712,6 +3717,11 @@ int dev_forward_skb(struct net_device *dev, 
>> struct sk_buff *skb);
>>   bool is_skb_forwardable(const struct net_device *dev,
>>               const struct sk_buff *skb);
>> +#ifdef CONFIG_XDP_SOCKETS
>> +int dev_set_direct_xsk_prog(struct net_device *dev);
>> +int dev_clear_direct_xsk_prog(struct net_device *dev);
>> +#endif
>> +
>>   static __always_inline int ____dev_forward_skb(struct net_device *dev,
>>                              struct sk_buff *skb)
>>   {
>> diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
>> index c9398ce7960f..9158233d34e1 100644
>> --- a/include/net/xdp_sock.h
>> +++ b/include/net/xdp_sock.h
>> @@ -76,6 +76,9 @@ struct xsk_map_node {
>>       struct xdp_sock **map_entry;
>>   };
>> +/* Flags for the xdp_sock flags field. */
>> +#define XDP_SOCK_DIRECT (1 << 0)
>> +
>>   struct xdp_sock {
>>       /* struct sock must be the first member of struct xdp_sock */
>>       struct sock sk;
>> @@ -104,6 +107,7 @@ struct xdp_sock {
>>       struct list_head map_list;
>>       /* Protects map_list */
>>       spinlock_t map_list_lock;
>> +    u16 flags;
> 
> Right now only XDP_DIRECT is tracked here. Maybe track all the flags,
> and show them in xsk_diag.

I see zc as the other field that can be converted into a flag.
Do you want to include that as part of this series or can it be done later?
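
For instance (hypothetical; XDP_SOCK_ZEROCOPY would replace the existing
xs->zc bool):

/* Flags for the xdp_sock flags field. */
#define XDP_SOCK_DIRECT		(1 << 0)
#define XDP_SOCK_ZEROCOPY	(1 << 1)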

> 
>>   };
>>   struct xdp_buff;
>> @@ -129,6 +133,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
>>   void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
>>   void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
>>   bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
>> +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 
>> queue_id);
>>   void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
>>                    struct xdp_sock **map_entry);
>> diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
>> index be328c59389d..d202b5d40859 100644
>> --- a/include/uapi/linux/if_xdp.h
>> +++ b/include/uapi/linux/if_xdp.h
>> @@ -25,6 +25,11 @@
>>    * application.
>>    */
>>   #define XDP_USE_NEED_WAKEUP (1 << 3)
>> +/* This option allows an AF_XDP socket bound to a queue to receive all
>> + * the packets directly from that queue when there is no XDP program
>> + * attached to the device.
>> + */
>> +#define XDP_DIRECT    (1 << 4)
>>   /* Flags for xsk_umem_config flags */
>>   #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 205f95af67d2..871d738a78d2 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -1349,13 +1349,14 @@ static void __bpf_prog_put(struct bpf_prog 
>> *prog, bool do_idr_lock)
>>   void bpf_prog_put(struct bpf_prog *prog)
>>   {
>> -    __bpf_prog_put(prog, true);
>> +    if (!bpf_is_direct_xsk_prog(prog))
>> +        __bpf_prog_put(prog, true);
>>   }
>>   EXPORT_SYMBOL_GPL(bpf_prog_put);
>>   u32 bpf_get_prog_id(const struct bpf_prog *prog)
>>   {
>> -    if (prog)
>> +    if (prog && !bpf_is_direct_xsk_prog(prog))
>>           return prog->aux->id;
>>       return 0;
>> @@ -1364,7 +1365,7 @@ EXPORT_SYMBOL(bpf_get_prog_id);
>>   void bpf_set_prog_id(struct bpf_prog *prog, u32 id)
>>   {
>> -    if (prog)
>> +    if (prog && !bpf_is_direct_xsk_prog(prog))
>>           prog->aux->id = id;
>>   }
>>   EXPORT_SYMBOL(bpf_set_prog_id);
>> diff --git a/net/core/dev.c b/net/core/dev.c
>> index 866d0ad936a5..eb3cd718e580 100644
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -8269,6 +8269,10 @@ int dev_change_xdp_fd(struct net_device *dev, 
>> struct netlink_ext_ack *extack,
>>       } else {
>>           if (!__dev_xdp_query(dev, bpf_op, query))
>>               return 0;
>> +#ifdef CONFIG_XDP_SOCKETS
>> +        if (dev->direct_xsk_count)
>> +            prog = (void *)BPF_PROG_DIRECT_XSK;
>> +#endif
> 
> Nit, but maybe hide this weirdness in a function?

OK.

> 
>>       }
>>       err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
>> @@ -8278,6 +8282,52 @@ int dev_change_xdp_fd(struct net_device *dev, 
>> struct netlink_ext_ack *extack,
>>       return err;
>>   }
>> +#ifdef CONFIG_XDP_SOCKETS
>> +int dev_set_direct_xsk_prog(struct net_device *dev)
>> +{
>> +    const struct net_device_ops *ops = dev->netdev_ops;
>> +    struct bpf_prog *prog;
>> +    bpf_op_t bpf_op;
>> +
>> +    ASSERT_RTNL();
>> +
>> +    dev->direct_xsk_count++;
>> +
>> +    bpf_op = ops->ndo_bpf;
>> +    if (!bpf_op)
>> +        return -EOPNOTSUPP;
>> +
>> +    if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
>> +        return 0;
>> +
>> +    prog = (void *)BPF_PROG_DIRECT_XSK;
>> +
>> +    return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, prog);
>> +}
>> +
>> +int dev_clear_direct_xsk_prog(struct net_device *dev)
>> +{
>> +    const struct net_device_ops *ops = dev->netdev_ops;
>> +    bpf_op_t bpf_op;
>> +
>> +    ASSERT_RTNL();
>> +
>> +    dev->direct_xsk_count--;
>> +
>> +    if (dev->direct_xsk_count)
>> +        return 0;
>> +
>> +    bpf_op = ops->ndo_bpf;
>> +    if (!bpf_op)
>> +        return -EOPNOTSUPP;
>> +
>> +    if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
>> +        return 0;
>> +
>> +    return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, NULL);
>> +}
>> +#endif
>> +
>>   /**
>>    *    dev_new_index    -    allocate an ifindex
>>    *    @net: the applicable net namespace
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index ed6563622ce3..391d7d600284 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -73,6 +73,7 @@
>>   #include <net/lwtunnel.h>
>>   #include <net/ipv6_stubs.h>
>>   #include <net/bpf_sk_storage.h>
>> +#include <linux/static_key.h>
>>   /**
>>    *    sk_filter_trim_cap - run a packet through a socket filter
>> @@ -3546,6 +3547,22 @@ static int __bpf_tx_xdp_map(struct net_device 
>> *dev_rx, void *fwd,
>>       return 0;
>>   }
>> +#ifdef CONFIG_XDP_SOCKETS
>> +static void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
>> +{
>> +    struct xdp_sock *xsk = READ_ONCE(ri->xsk);
> 
> Why READ_ONCE here?
> 
>> +
>> +    if (xsk) {
>> +        ri->xsk = NULL;
>> +        xsk_flush(xsk);
>> +    }
>> +}
>> +#else
>> +static inline void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
>> +{
>> +}
>> +#endif
>> +
> 
> Move CONFIG_XDP_SOCKETS into the function, and remove the empty/bottom one.
> 
>>   void xdp_do_flush_map(void)
>>   {
>>       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>> @@ -3568,6 +3585,8 @@ void xdp_do_flush_map(void)
>>               break;
>>           }
>>       }
>> +
>> +    xdp_do_flush_xsk(ri);
>>   }
>>   EXPORT_SYMBOL_GPL(xdp_do_flush_map);
>> @@ -3631,11 +3650,28 @@ static int xdp_do_redirect_map(struct 
>> net_device *dev, struct xdp_buff *xdp,
>>       return err;
>>   }
>> +#ifdef CONFIG_XDP_SOCKETS
>> +static inline struct xdp_sock *xdp_get_direct_xsk(struct 
>> bpf_redirect_info *ri)
>> +{
>> +    return READ_ONCE(ri->xsk);
> 
> Again, why READ_ONCE? Please leave the inlining to the compiler in .c 
> files.
> 
>> +}
>> +#else
>> +static inline struct xdp_sock *xdp_get_direct_xsk(struct 
>> bpf_redirect_info *ri)
>> +{
>> +    return NULL;
>> +}
>> +#endif
>> +
>>   int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
>>               struct bpf_prog *xdp_prog)
>>   {
>>       struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>>       struct bpf_map *map = READ_ONCE(ri->map);
>> +    struct xdp_sock *xsk;
>> +
>> +    xsk = xdp_get_direct_xsk(ri);
>> +    if (xsk)
>> +        return xsk_rcv(xsk, xdp);
> 
> Hmm, maybe you need a xsk_to_flush as well. Say that a user swaps in a
> regular XDP program, then xsk_rcv() will be called until the flush
> occurs, right? IOW, all packets processed (napi budget) in the napi_poll
> will end up in the socket.

I originally had xsk_to_flush to handle this possibility of the XDP
program changing before the call to flush.
Will re-introduce it in the next revision.
Should I create a separate structure bpf_direct_xsk_info rather than
adding these fields to bpf_redirect_info?
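
Perhaps along these lines (a hypothetical shape, including the
xsk_to_flush field discussed above):

struct bpf_direct_xsk_info {
	struct xdp_sock *xsk;
	struct xdp_sock *xsk_to_flush;
};

DECLARE_PER_CPU(struct bpf_direct_xsk_info, bpf_direct_xsk_info);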


> 
>>       if (likely(map))
>>           return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
>> @@ -8934,4 +8970,26 @@ const struct bpf_verifier_ops 
>> sk_reuseport_verifier_ops = {
>>   const struct bpf_prog_ops sk_reuseport_prog_ops = {
>>   };
>> +
>> +#ifdef CONFIG_XDP_SOCKETS
>> +DEFINE_STATIC_KEY_FALSE(xdp_direct_xsk_needed);
>> +EXPORT_SYMBOL_GPL(xdp_direct_xsk_needed);
>> +
>> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp)
>> +{
>> +    struct xdp_sock *xsk;
>> +
>> +    xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index);
>> +    if (xsk) {
>> +        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
>> +
>> +        ri->xsk = xsk;
>> +        return XDP_REDIRECT;
> 
> From the comment above: I *think* you need an ri->xsk_to_flush. Can the
> direct socket (swap socket) change before the flush?

Yes.

> 
>> +    }
>> +
>> +    return XDP_PASS;
>> +}
>> +EXPORT_SYMBOL(bpf_direct_xsk);
>> +#endif
>> +
>>   #endif /* CONFIG_INET */
>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>> index fa8fbb8fa3c8..8a29939bac7e 100644
>> --- a/net/xdp/xsk.c
>> +++ b/net/xdp/xsk.c
>> @@ -24,6 +24,7 @@
>>   #include <linux/rculist.h>
>>   #include <net/xdp_sock.h>
>>   #include <net/xdp.h>
>> +#include <linux/if_link.h>
>>   #include "xsk_queue.h"
>>   #include "xdp_umem.h"
>> @@ -264,6 +265,45 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct 
>> xdp_buff *xdp)
>>       return err;
>>   }
>> +static void xdp_clear_direct_xsk(struct xdp_sock *xsk)
> 
> Use xs, and not xsk for consistency.

OK.

> 
>> +{
>> +    struct net_device *dev = xsk->dev;
>> +    u32 qid = xsk->queue_id;
>> +
>> +    if (!dev->_rx[qid].xsk)
>> +        return;
>> +
>> +    dev_clear_direct_xsk_prog(dev);
>> +    dev->_rx[qid].xsk = NULL;
>> +    static_branch_dec(&xdp_direct_xsk_needed);
>> +    xsk->flags &= ~XDP_SOCK_DIRECT;
>> +}
>> +
>> +static int xdp_set_direct_xsk(struct xdp_sock *xsk)
> 
> Same here.
> 
>> +{
>> +    struct net_device *dev = xsk->dev;
>> +    u32 qid = xsk->queue_id;
>> +    int err = 0;
>> +
>> +    if (dev->_rx[qid].xsk)
>> +        return -EEXIST;
>> +
>> +    xsk->flags |= XDP_SOCK_DIRECT;
>> +    static_branch_inc(&xdp_direct_xsk_needed);
>> +    dev->_rx[qid].xsk = xsk;
>> +    err = dev_set_direct_xsk_prog(dev);
>> +    if (err)
>> +        xdp_clear_direct_xsk(xsk);
>> +
>> +    return err;
>> +}
>> +
>> +struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 
>> queue_id)
>> +{
>> +    return dev->_rx[queue_id].xsk;
>> +}
>> +EXPORT_SYMBOL(xdp_get_xsk_from_qid);
>> +
>>   void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
>>   {
>>       xskq_produce_flush_addr_n(umem->cq, nb_entries);
>> @@ -464,6 +504,11 @@ static void xsk_unbind_dev(struct xdp_sock *xs)
>>           return;
>>       WRITE_ONCE(xs->state, XSK_UNBOUND);
>> +    if (xs->flags & XDP_SOCK_DIRECT) {
>> +        rtnl_lock();
>> +        xdp_clear_direct_xsk(xs);
>> +        rtnl_unlock();
>> +    }
>>       /* Wait for driver to stop using the xdp socket. */
>>       xdp_del_sk_umem(xs->umem, xs);
>>       xs->dev = NULL;
>> @@ -604,7 +649,7 @@ static int xsk_bind(struct socket *sock, struct 
>> sockaddr *addr, int addr_len)
>>       flags = sxdp->sxdp_flags;
>>       if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
>> -              XDP_USE_NEED_WAKEUP))
>> +              XDP_USE_NEED_WAKEUP | XDP_DIRECT))
>>           return -EINVAL;
>>       rtnl_lock();
>> @@ -632,7 +677,7 @@ static int xsk_bind(struct socket *sock, struct 
>> sockaddr *addr, int addr_len)
>>           struct socket *sock;
>>           if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
>> -            (flags & XDP_USE_NEED_WAKEUP)) {
>> +            (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_DIRECT)) {
>>               /* Cannot specify flags for shared sockets. */
>>               err = -EINVAL;
>>               goto out_unlock;
>> @@ -688,6 +733,8 @@ static int xsk_bind(struct socket *sock, struct 
>> sockaddr *addr, int addr_len)
>>       xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
>>       xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
>>       xdp_add_sk_umem(xs->umem, xs);
>> +    if (flags & XDP_DIRECT)
>> +        err = xdp_set_direct_xsk(xs);
>>   out_unlock:
>>       if (err) {
>> diff --git a/tools/include/uapi/linux/if_xdp.h 
>> b/tools/include/uapi/linux/if_xdp.h
>> index be328c59389d..d202b5d40859 100644
>> --- a/tools/include/uapi/linux/if_xdp.h
>> +++ b/tools/include/uapi/linux/if_xdp.h
>> @@ -25,6 +25,11 @@
>>    * application.
>>    */
>>   #define XDP_USE_NEED_WAKEUP (1 << 3)
>> +/* This option allows an AF_XDP socket bound to a queue to receive all
>> + * the packets directly from that queue when there is no XDP program
>> + * attached to the device.
>> + */
>> +#define XDP_DIRECT    (1 << 4)
>>   /* Flags for xsk_umem_config flags */
>>   #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
>>
Samudrala, Sridhar Oct. 9, 2019, 4:53 p.m. UTC | #8
>> +
>> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp)
>> +{
>> +       struct xdp_sock *xsk;
>> +
>> +       xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index);
>> +       if (xsk) {
>> +               struct bpf_redirect_info *ri =
>> + this_cpu_ptr(&bpf_redirect_info);
>> +
>> +               ri->xsk = xsk;
>> +               return XDP_REDIRECT;
>> +       }
>> +
>> +       return XDP_PASS;
>> +}
>> +EXPORT_SYMBOL(bpf_direct_xsk);
> 
> So you're saying there is a:
> """
> xdpsock rxdrop 1 core (both app and queue's irq pinned to the same core)
>     default : taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1
>     direct-xsk :taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1
> 6.1x improvement in drop rate
> """
> 
> 6.1x gain running above C code vs exactly equivalent BPF code?
> How is that possible?

It seems to be due to the overhead of __bpf_prog_run on older processors
(Ivybridge). The overhead is smaller on newer processors, but even on
Skylake I see around 1.5x improvement.

perf report with default xdpsock
================================
Samples: 2K of event 'cycles:ppp', Event count (approx.): 8437658090
Overhead  Command          Shared Object     Symbol
   34.57%  xdpsock          xdpsock           [.] main
   17.19%  ksoftirqd/1      [kernel.vmlinux]  [k] ___bpf_prog_run
   13.12%  xdpsock          [kernel.vmlinux]  [k] ___bpf_prog_run
    4.09%  ksoftirqd/1      [kernel.vmlinux]  [k] __x86_indirect_thunk_rax
    3.08%  xdpsock          [kernel.vmlinux]  [k] nmi
    2.76%  ksoftirqd/1      [kernel.vmlinux]  [k] xsk_map_lookup_elem
    2.33%  xdpsock          [kernel.vmlinux]  [k] __x86_indirect_thunk_rax
    2.33%  ksoftirqd/1      [i40e]            [k] i40e_clean_rx_irq_zc
    2.16%  xdpsock          [kernel.vmlinux]  [k] bpf_map_lookup_elem
    1.82%  ksoftirqd/1      [kernel.vmlinux]  [k] xdp_do_redirect
    1.41%  ksoftirqd/1      [kernel.vmlinux]  [k] xsk_rcv
    1.39%  ksoftirqd/1      [kernel.vmlinux]  [k] update_curr
    1.09%  ksoftirqd/1      [kernel.vmlinux]  [k] bpf_xdp_redirect_map
    1.09%  xdpsock          [i40e]            [k] i40e_clean_rx_irq_zc
    1.08%  ksoftirqd/1      [kernel.vmlinux]  [k] __xsk_map_redirect
    1.07%  swapper          [kernel.vmlinux]  [k] xsk_umem_peek_addr
    1.05%  ksoftirqd/1      [kernel.vmlinux]  [k] xsk_umem_peek_addr
    0.89%  swapper          [kernel.vmlinux]  [k] __xsk_map_redirect
    0.87%  ksoftirqd/1      [kernel.vmlinux]  [k] __bpf_prog_run32
    0.87%  swapper          [kernel.vmlinux]  [k] intel_idle
    0.67%  xdpsock          [kernel.vmlinux]  [k] bpf_xdp_redirect_map
    0.57%  xdpsock          [kernel.vmlinux]  [k] xdp_do_redirect

perf report with direct xdpsock
===============================
Samples: 2K of event 'cycles:ppp', Event count (approx.): 17996091975
Overhead  Command          Shared Object     Symbol
   18.44%  xdpsock          [i40e]            [k] i40e_clean_rx_irq_zc
   15.14%  ksoftirqd/1      [i40e]            [k] i40e_clean_rx_irq_zc
    6.87%  xdpsock          [kernel.vmlinux]  [k] xsk_umem_peek_addr
    5.03%  ksoftirqd/1      [kernel.vmlinux]  [k] xdp_do_redirect
    4.21%  xdpsock          xdpsock           [.] main
    4.13%  ksoftirqd/1      [i40e]            [k] i40e_clean_programming_status
    3.71%  xdpsock          [kernel.vmlinux]  [k] xsk_rcv
    3.44%  ksoftirqd/1      [kernel.vmlinux]  [k] nmi
    3.41%  xdpsock          [kernel.vmlinux]  [k] nmi
    3.20%  ksoftirqd/1      [kernel.vmlinux]  [k] xsk_rcv
    2.45%  xdpsock          [kernel.vmlinux]  [k] xdp_get_xsk_from_qid
    2.35%  ksoftirqd/1      [kernel.vmlinux]  [k] xsk_umem_peek_addr
    2.33%  ksoftirqd/1      [kernel.vmlinux]  [k] net_rx_action
    2.16%  ksoftirqd/1      [kernel.vmlinux]  [k] xsk_umem_consume_tx
    2.10%  swapper          [kernel.vmlinux]  [k] __softirqentry_text_start
    2.06%  xdpsock          [kernel.vmlinux]  [k] native_irq_return_iret
    1.43%  xdpsock          [kernel.vmlinux]  [k] check_preempt_wakeup
    1.42%  xdpsock          [kernel.vmlinux]  [k] xsk_umem_consume_tx
    1.22%  xdpsock          [kernel.vmlinux]  [k] xdp_do_redirect
    1.21%  xdpsock          [kernel.vmlinux]  [k] dma_direct_sync_single_for_device
    1.16%  ksoftirqd/1      [kernel.vmlinux]  [k] irqtime_account_irq
    1.09%  xdpsock          [kernel.vmlinux]  [k] sock_def_readable
    0.99%  swapper          [kernel.vmlinux]  [k] intel_idle
    0.88%  xdpsock          [i40e]            [k] i40e_clean_programming_status
    0.74%  ksoftirqd/1      [kernel.vmlinux]  [k] xsk_umem_discard_addr
    0.71%  ksoftirqd/1      [kernel.vmlinux]  [k] __switch_to
    0.50%  ksoftirqd/1      [kernel.vmlinux]  [k] dma_direct_sync_single_for_device
Alexei Starovoitov Oct. 9, 2019, 5:17 p.m. UTC | #9
On Wed, Oct 9, 2019 at 9:53 AM Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
>
> >> +
> >> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp)
> >> +{
> >> +       struct xdp_sock *xsk;
> >> +
> >> +       xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index);
> >> +       if (xsk) {
> >> +               struct bpf_redirect_info *ri =
> >> + this_cpu_ptr(&bpf_redirect_info);
> >> +
> >> +               ri->xsk = xsk;
> >> +               return XDP_REDIRECT;
> >> +       }
> >> +
> >> +       return XDP_PASS;
> >> +}
> >> +EXPORT_SYMBOL(bpf_direct_xsk);
> >
> > So you're saying there is a:
> > """
> > xdpsock rxdrop 1 core (both app and queue's irq pinned to the same core)
> >     default : taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1
> >     direct-xsk :taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1
> > 6.1x improvement in drop rate
> > """
> >
> > 6.1x gain running above C code vs exactly equivalent BPF code?
> > How is that possible?
>
> It seems to be due to the overhead of __bpf_prog_run on older processors
> (Ivybridge). The overhead is smaller on newer processors, but even on
> Skylake I see around 1.5x improvement.
>
> perf report with default xdpsock
> ================================
> Samples: 2K of event 'cycles:ppp', Event count (approx.): 8437658090
> Overhead  Command          Shared Object     Symbol
>    34.57%  xdpsock          xdpsock           [.] main
>    17.19%  ksoftirqd/1      [kernel.vmlinux]  [k] ___bpf_prog_run
>    13.12%  xdpsock          [kernel.vmlinux]  [k] ___bpf_prog_run

That must be a bad joke.
The whole patch set is based on comparing native code to interpreter?!
It's pretty awesome that interpreter is only 1.5x slower than native x86.
Just turn the JIT on.
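
(For reference, the JIT is toggled via the standard sysctl, e.g.:

	sysctl -w net.core.bpf_jit_enable=1

or by writing to /proc/sys/net/core/bpf_jit_enable.)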

Obvious Nack to the patch set.
Samudrala, Sridhar Oct. 9, 2019, 7:12 p.m. UTC | #10
On 10/9/2019 10:17 AM, Alexei Starovoitov wrote:
> On Wed, Oct 9, 2019 at 9:53 AM Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>>
>>
>>>> +
>>>> +u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp)
>>>> +{
>>>> +       struct xdp_sock *xsk;
>>>> +
>>>> +       xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index);
>>>> +       if (xsk) {
>>>> +               struct bpf_redirect_info *ri =
>>>> + this_cpu_ptr(&bpf_redirect_info);
>>>> +
>>>> +               ri->xsk = xsk;
>>>> +               return XDP_REDIRECT;
>>>> +       }
>>>> +
>>>> +       return XDP_PASS;
>>>> +}
>>>> +EXPORT_SYMBOL(bpf_direct_xsk);
>>>
>>> So you're saying there is a:
>>> """
>>> xdpsock rxdrop 1 core (both app and queue's irq pinned to the same core)
>>>      default : taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1
>>>      direct-xsk :taskset -c 1 ./xdpsock -i enp66s0f0 -r -q 1
>>> 6.1x improvement in drop rate
>>> """
>>>
>>> 6.1x gain running above C code vs exactly equivalent BPF code?
>>> How is that possible?
>>
>> It seems to be due to the overhead of __bpf_prog_run on older processors
>> (Ivybridge). The overhead is smaller on newer processors, but even on
>> Skylake I see around 1.5x improvement.
>>
>> perf report with default xdpsock
>> ================================
>> Samples: 2K of event 'cycles:ppp', Event count (approx.): 8437658090
>> Overhead  Command          Shared Object     Symbol
>>     34.57%  xdpsock          xdpsock           [.] main
>>     17.19%  ksoftirqd/1      [kernel.vmlinux]  [k] ___bpf_prog_run
>>     13.12%  xdpsock          [kernel.vmlinux]  [k] ___bpf_prog_run
> 
> That must be a bad joke.
> The whole patch set is based on comparing native code to interpreter?!
> It's pretty awesome that interpreter is only 1.5x slower than native x86.
> Just turn the JIT on.

Thanks Alexei for pointing out that I didn't have JIT on.
When I turn it on, the performance improvement is a more modest 1.5x
with rxdrop and 1.2x with l2fwd.

> 
> Obvious Nack to the patch set.
> 

Will update the patchset with the right performance data and address
feedback from Björn.
Hope you are not totally against the direct XDP approach, as it does provide
value when an AF_XDP socket is bound to a queue and a HW filter can
direct packets targeted for that queue.
Alexei Starovoitov Oct. 10, 2019, 1:06 a.m. UTC | #11
On Wed, Oct 9, 2019 at 12:12 PM Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
> >>     34.57%  xdpsock          xdpsock           [.] main
> >>     17.19%  ksoftirqd/1      [kernel.vmlinux]  [k] ___bpf_prog_run
> >>     13.12%  xdpsock          [kernel.vmlinux]  [k] ___bpf_prog_run
> >
> > That must be a bad joke.
> > The whole patch set is based on comparing native code to interpreter?!
> > It's pretty awesome that interpreter is only 1.5x slower than native x86.
> > Just turn the JIT on.
>
> Thanks Alexei for pointing out that I didn't have JIT on.
> When I turn it on, the performance improvement is a more modest 1.5x
> with rxdrop and 1.2x with l2fwd.

I want to see perf reports back to back with proper performance analysis.

> >
> > Obvious Nack to the patch set.
> >
>
> Will update the patchset with the right performance data and address
> feedback from Björn.
> Hope you are not totally against the direct XDP approach, as it does provide
> value when an AF_XDP socket is bound to a queue and a HW filter can
> direct packets targeted for that queue.

I used to be in favor of providing a "prog bypass" for af_xdp,
because of anecdotal evidence that it would make af_xdp faster.
Now, seeing the numbers and the way they were collected,
I'm against such a bypass.
I want to see hard proof that a trivial bpf prog is actually slowing things down
before reviewing any new patch sets.
Samudrala, Sridhar Oct. 18, 2019, 6:40 p.m. UTC | #12
On 10/9/2019 6:06 PM, Alexei Starovoitov wrote:
>>
>> Will update the patchset with the right performance data and address
>> feedback from Bjorn.
>> Hope you are not totally against direct XDP approach as it does provide
>> value when an AF_XDP socket is bound to a queue and a HW filter can
>> direct packets targeted for that queue.
> 
> I used to be in favor of providing a "prog bypass" for af_xdp,
> because of anecdotal evidence that it would make af_xdp faster.
> Now, seeing the numbers and the way they were collected,
> I'm against such a bypass.
> I want to see hard proof that a trivial bpf prog is actually slowing things down
> before reviewing any new patch sets.
> 

Here is a more detailed performance report that compares the current AF_XDP rxdrop
with the patches that enable direct receive without any XDP program. As Jakub
requested, I also collected kernel rxdrop data, and I have included perf reports.
Hope it addresses the concerns you raised with the earlier data I posted.

Test Setup
==========
2 Skylake servers with Intel 40Gbe NICs connected via 100Gb Switch

Server Configuration
====================
Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz (skylake)
CPU(s):              112
On-line CPU(s) list: 0-111
Thread(s) per core:  2
Core(s) per socket:  28
Socket(s):           2
NUMA node(s):        2
NUMA node0 CPU(s):   0-27,56-83
NUMA node1 CPU(s):   28-55,84-111

Memory: 96GB

NIC: Intel Corporation Ethernet Controller XL710 for 40GbE QSFP+ (rev 02)

Distro
======
Fedora 29 (Server Edition)

Kernel Configuration
====================
AF_XDP direct socket patches applied on top of
bpf-next git repo HEAD: 05949f63055fcf53947886ddb8e23c8a5d41bd80

# cat /proc/cmdline
BOOT_IMAGE=/vmlinuz-5.3.0-bpf-next-dxk+ root=/dev/mapper/fedora-root ro resume=/dev/mapper/fedora-swap rd.lvm.lv=fedora/root rd.lvm.lv=fedora/swap LANG=en_IN.UTF-8

For ‘mitigations OFF’ scenarios, the kernel command line parameter is changed to add ‘mitigations=off’

Packet Generator on the link partner
====================================
   pktgen - sending 64-byte UDP packets at 43 Mpps

HW filter to redirect packet to a queue
=======================================
ethtool -N ens260f1 flow-type udp4 dst-ip 192.168.128.41 dst-port 9 action 28

Test Cases
==========
kernel rxdrop
   taskset -c 28 samples/bpf/xdp_rxq_info -d ens260f1 -a XDP_DROP

AF_XDP default rxdrop
   taskset -c 28 samples/bpf/xdpsock -i ens260f1 -r -q 28

AF_XDP direct rxdrop
   taskset -c 28 samples/bpf/xdpsock -i ens260f1 -r -d -q 28

Performance Results
===================
Only 1 core is used in all these testcases as the app and the queue irq are pinned to the same core.

----------------------------------------------------------------------------------
                                mitigations ON                mitigations OFF
   Testcase              ----------------------------------------------------------
                         no patches    with patches       no patches   with patches
----------------------------------------------------------------------------------
AF_XDP default rxdrop        X             X                   Y            Y
AF_XDP direct rxdrop        N/A          X+46%                N/A         Y+25%
Kernel rxdrop              X+61%         X+61%               Y+53%        Y+53%
----------------------------------------------------------------------------------

Here Y is the pps rate with CPU security mitigations turned OFF, and it is 26% better than X.
So there is a 46% improvement in AF_XDP rxdrop performance with direct receive when
mitigations are ON (the default configuration) and a 25% improvement when mitigations are
turned OFF.
As expected, the in-kernel rxdrop performance is higher even with direct receive in
both scenarios.

Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON
==========================================================================
Samples: 44K of event 'cycles', Event count (approx.): 38532389541
Overhead  Command          Shared Object              Symbol
   15.31%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
   10.50%  ksoftirqd/28     bpf_prog_80b55d8a76303785  [k] bpf_prog_80b55d8a76303785
    9.48%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
    8.62%  xdpsock          xdpsock                    [.] main
    7.11%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
    5.81%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
    4.46%  xdpsock          bpf_prog_80b55d8a76303785  [k] bpf_prog_80b55d8a76303785
    3.83%  xdpsock          [kernel.vmlinux]           [k] xsk_rcv
    2.81%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
    2.78%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_map_lookup_elem
    2.44%  xdpsock          [kernel.vmlinux]           [k] xdp_do_redirect
    2.19%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_redirect
    1.62%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_umem_peek_addr
    1.57%  xdpsock          [kernel.vmlinux]           [k] xsk_umem_peek_addr
    1.32%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
    1.28%  xdpsock          [kernel.vmlinux]           [k] bpf_xdp_redirect_map
    1.15%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
    1.12%  xdpsock          [kernel.vmlinux]           [k] xsk_map_lookup_elem
    1.06%  xdpsock          [kernel.vmlinux]           [k] __xsk_map_redirect
    0.94%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
    0.75%  ksoftirqd/28     [kernel.vmlinux]           [k] __x86_indirect_thunk_rax
    0.66%  ksoftirqd/28     [i40e]                     [k] i40e_clean_programming_status
    0.64%  ksoftirqd/28     [kernel.vmlinux]           [k] net_rx_action
    0.64%  swapper          [kernel.vmlinux]           [k] intel_idle
    0.62%  ksoftirqd/28     [i40e]                     [k] i40e_napi_poll
    0.57%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu

Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON
==========================================================================
Samples: 46K of event 'cycles', Event count (approx.): 38387018585
Overhead  Command          Shared Object             Symbol
   21.94%  ksoftirqd/28     [i40e]                    [k] i40e_clean_rx_irq_zc
   14.36%  xdpsock          xdpsock                   [.] main
   11.53%  ksoftirqd/28     [kernel.vmlinux]          [k] xsk_rcv
   11.32%  xdpsock          [i40e]                    [k] i40e_clean_rx_irq_zc
    4.02%  xdpsock          [kernel.vmlinux]          [k] xsk_rcv
    2.91%  ksoftirqd/28     [kernel.vmlinux]          [k] xdp_do_redirect
    2.45%  ksoftirqd/28     [kernel.vmlinux]          [k] xsk_umem_peek_addr
    2.19%  xdpsock          [kernel.vmlinux]          [k] xsk_umem_peek_addr
    2.08%  ksoftirqd/28     [kernel.vmlinux]          [k] bpf_direct_xsk
    2.07%  ksoftirqd/28     [kernel.vmlinux]          [k] dma_direct_sync_single_for_cpu
    1.53%  ksoftirqd/28     [kernel.vmlinux]          [k] dma_direct_sync_single_for_device
    1.39%  xdpsock          [kernel.vmlinux]          [k] dma_direct_sync_single_for_device
    1.22%  ksoftirqd/28     [kernel.vmlinux]          [k] xdp_get_xsk_from_qid
    1.12%  ksoftirqd/28     [i40e]                    [k] i40e_clean_programming_status
    0.96%  ksoftirqd/28     [i40e]                    [k] i40e_napi_poll
    0.95%  ksoftirqd/28     [kernel.vmlinux]          [k] net_rx_action
    0.89%  xdpsock          [kernel.vmlinux]          [k] xdp_do_redirect
    0.83%  swapper          [i40e]                    [k] i40e_clean_rx_irq_zc
    0.70%  swapper          [kernel.vmlinux]          [k] intel_idle
    0.66%  xdpsock          [kernel.vmlinux]          [k] dma_direct_sync_single_for_cpu
    0.60%  xdpsock          [kernel.vmlinux]          [k] bpf_direct_xsk
    0.50%  ksoftirqd/28     [kernel.vmlinux]          [k] xsk_umem_discard_addr

Based on the perf reports comparing AF_XDP default and direct rxdrop, the AF_XDP direct
rxdrop codepath avoids the overhead of going through these functions:
	bpf_prog_xxx (the loaded XDP program)
	bpf_xdp_redirect_map
	xsk_map_lookup_elem
	__xsk_map_redirect
With AF_XDP direct, xsk_rcv() is called directly via bpf_direct_xsk() in xdp_do_redirect().

The above test results document performance of components on a particular test, in specific systems.
Differences in hardware, software, or configuration will affect actual performance.

Thanks
Sridhar
Toke Høiland-Jørgensen Oct. 18, 2019, 7:22 p.m. UTC | #13
"Samudrala, Sridhar" <sridhar.samudrala@intel.com> writes:

> Performance Results
> ===================
> Only 1 core is used in all these testcases as the app and the queue irq are pinned to the same core.
>
> ----------------------------------------------------------------------------------
>                                 mitigations ON                mitigations OFF
>    Testcase              ----------------------------------------------------------
>                          no patches    with patches       no patches   with patches
> ----------------------------------------------------------------------------------
> AF_XDP default rxdrop        X             X                   Y            Y

Is this really exactly the same with and without patches? You're adding
an extra check to xdp_do_redirect(); are you really saying that the
impact of that is zero?

> AF_XDP direct rxdrop        N/A          X+46%                N/A         Y+25%
> Kernel rxdrop              X+61%         X+61%               Y+53%        Y+53%
> ----------------------------------------------------------------------------------
>
> Here Y is pps with CPU security mitigations turned OFF and it is 26%
> better than X.

Any particular reason you're not sharing the values of X and Y?

-Toke
Alexei Starovoitov Oct. 19, 2019, 12:14 a.m. UTC | #14
On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote:
> 
> Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON
> ==========================================================================
> Samples: 44K of event 'cycles', Event count (approx.): 38532389541
> Overhead  Command          Shared Object              Symbol
>   15.31%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
>   10.50%  ksoftirqd/28     bpf_prog_80b55d8a76303785  [k] bpf_prog_80b55d8a76303785
>    9.48%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
>    8.62%  xdpsock          xdpsock                    [.] main
>    7.11%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
>    5.81%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
>    4.46%  xdpsock          bpf_prog_80b55d8a76303785  [k] bpf_prog_80b55d8a76303785
>    3.83%  xdpsock          [kernel.vmlinux]           [k] xsk_rcv

why is everything duplicated?
Same code runs in different tasks?

>    2.81%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>    2.78%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_map_lookup_elem
>    2.44%  xdpsock          [kernel.vmlinux]           [k] xdp_do_redirect
>    2.19%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_redirect
>    1.62%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_umem_peek_addr
>    1.57%  xdpsock          [kernel.vmlinux]           [k] xsk_umem_peek_addr
>    1.32%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
>    1.28%  xdpsock          [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>    1.15%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
>    1.12%  xdpsock          [kernel.vmlinux]           [k] xsk_map_lookup_elem
>    1.06%  xdpsock          [kernel.vmlinux]           [k] __xsk_map_redirect
>    0.94%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
>    0.75%  ksoftirqd/28     [kernel.vmlinux]           [k] __x86_indirect_thunk_rax
>    0.66%  ksoftirqd/28     [i40e]                     [k] i40e_clean_programming_status
>    0.64%  ksoftirqd/28     [kernel.vmlinux]           [k] net_rx_action
>    0.64%  swapper          [kernel.vmlinux]           [k] intel_idle
>    0.62%  ksoftirqd/28     [i40e]                     [k] i40e_napi_poll
>    0.57%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
> 
> [...]
> 
> Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that
> AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions
> 	bpf_prog_xxx
>         bpf_xdp_redirect_map
> 	xsk_map_lookup_elem
>         __xsk_map_redirect
> With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect()

I don't think you're identifying the overhead correctly.
xsk_map_lookup_elem is 1%
but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem()
which is a different function:
ffffffff81493fe0 T __xsk_map_lookup_elem
ffffffff81492e80 t xsk_map_lookup_elem

10% for bpf_prog_80b55d8a76303785 is huge.
It's the actual code of the program _without_ any helpers.
How does the program actually look?
Samudrala, Sridhar Oct. 19, 2019, 12:45 a.m. UTC | #15
On 10/18/2019 5:14 PM, Alexei Starovoitov wrote:
> On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote:
>>
>> Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON
>> ==========================================================================
>> Samples: 44K of event 'cycles', Event count (approx.): 38532389541
>> Overhead  Command          Shared Object              Symbol
>>    15.31%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
>>    10.50%  ksoftirqd/28     bpf_prog_80b55d8a76303785  [k] bpf_prog_80b55d8a76303785
>>     9.48%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
>>     8.62%  xdpsock          xdpsock                    [.] main
>>     7.11%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
>>     5.81%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
>>     4.46%  xdpsock          bpf_prog_80b55d8a76303785  [k] bpf_prog_80b55d8a76303785
>>     3.83%  xdpsock          [kernel.vmlinux]           [k] xsk_rcv
> 
> why everything is duplicated?
> Same code runs in different tasks ?

Yes. Looks like these functions run from both the app (xdpsock) context and the ksoftirqd context.

>> [...]
> 
> I don't think you're identifying the overhead correctly.
> xsk_map_lookup_elem is 1%
> but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem()
> which is a different function:
> ffffffff81493fe0 T __xsk_map_lookup_elem
> ffffffff81492e80 t xsk_map_lookup_elem
> 
> 10% for bpf_prog_80b55d8a76303785 is huge.
> It's the actual code of the program _without_ any helpers.
> How does the program actually look?

It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c
https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268
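
For reference, that program is assembled from raw BPF instructions in xsk_load_xdp_prog(),
but it is roughly equivalent to the following C (a sketch; the old-style bpf_map_def
definition and the map size here are illustrative):

/* Sketch of libbpf's default AF_XDP program in C form. */
#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") xsks_map = {
	.type        = BPF_MAP_TYPE_XSKMAP,
	.key_size    = sizeof(int),
	.value_size  = sizeof(int),
	.max_entries = 64,
};

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
	int index = ctx->rx_queue_index;

	/* The lookup below is what shows up as xsk_map_lookup_elem in the
	 * perf reports; the redirect helper is bpf_xdp_redirect_map. */
	if (bpf_map_lookup_elem(&xsks_map, &index))
		return bpf_redirect_map(&xsks_map, index, 0);

	return XDP_PASS;
}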

In the archives, I see that Toke had some comments, but somehow I didn't get his email in
my inbox.

> Performance Results
> ===================
> Only 1 core is used in all these testcases as the app and the queue irq are pinned to the same core.
>
> ----------------------------------------------------------------------------------
>                                 mitigations ON                mitigations OFF
>    Testcase              ----------------------------------------------------------
>                          no patches    with patches       no patches   with patches
> ----------------------------------------------------------------------------------
> AF_XDP default rxdrop        X             X                   Y            Y

> Is this really exactly the same with and without patches? You're adding
> an extra check to xdp_do_redirect(); are you really saying that the
> impact of that is zero?

Yes. I didn't see any impact. The variation is within +/- 1%.
I could use a static_key even for that check in xdp_do_redirect() if required.
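
A minimal sketch of what that could look like (the key and field names here are made up,
not taken from the patch):

/* Hypothetical static key guarding the direct-xsk check; it would only be
 * enabled while at least one direct AF_XDP socket exists, so the normal
 * redirect path keeps a no-op branch. */
DEFINE_STATIC_KEY_FALSE(bpf_direct_xsk_needed);

/* in xdp_do_redirect(): */
if (static_branch_unlikely(&bpf_direct_xsk_needed)) {
	struct xdp_sock *xsk = READ_ONCE(ri->xsk);	/* hypothetical field */

	if (xsk)
		return xsk_rcv(xsk, xdp);
}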

-Sridhar
Alexei Starovoitov Oct. 19, 2019, 2:25 a.m. UTC | #16
On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote:
> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote:
> > On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote:
> > > [...]
> > 
> > I don't think you're identifying the overhead correctly.
> > xsk_map_lookup_elem is 1%
> > but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem()
> > which is a different function:
> > ffffffff81493fe0 T __xsk_map_lookup_elem
> > ffffffff81492e80 t xsk_map_lookup_elem
> > 
> > 10% for bpf_prog_80b55d8a76303785 is huge.
> > It's the actual code of the program _without_ any helpers.
> > How does the program actually look?
> 
> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c
> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268

I see. Looks like map_gen_lookup was never implemented for xskmap.
How about adding it first the way array_map_gen_lookup() is implemented?
This will easily give 2x perf gain.
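
For context, map_gen_lookup() lets the verifier replace the bpf_map_lookup_elem() helper
call with inline instructions. For an array-of-pointers map like the xskmap, the emitted
code would reduce to roughly this C (a sketch; the struct layout is an assumption):

/* An inlined xskmap lookup boils down to a bounds check plus a direct
 * indexed load, with no call through the helper. */
static inline struct xdp_sock *xsk_map_lookup_inline(struct xsk_map *m, u32 index)
{
	if (index >= m->map.max_entries)
		return NULL;
	return m->xsk_map[index];
}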
Toke Høiland-Jørgensen Oct. 20, 2019, 10:14 a.m. UTC | #17
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote:
>> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote:
>> > On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote:
>> > > [...]
>> > 
>> > I don't think you're identifying the overhead correctly.
>> > xsk_map_lookup_elem is 1%
>> > but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem()
>> > which is a different function:
>> > ffffffff81493fe0 T __xsk_map_lookup_elem
>> > ffffffff81492e80 t xsk_map_lookup_elem
>> > 
>> > 10% for bpf_prog_80b55d8a76303785 is huge.
>> > It's the actual code of the program _without_ any helpers.
>> > How does the program actually look?
>> 
>> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c
>> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268
>
> I see. Looks like map_gen_lookup was never implemented for xskmap.
> How about adding it first the way array_map_gen_lookup() is implemented?
> This will easily give 2x perf gain.

I guess we should implement this for devmaps as well now that we allow
lookups into those.

However, in this particular example, the lookup from BPF is not actually
needed, since bpf_redirect_map() will return a configurable error value
when the map lookup fails (for exactly this use case).

So replacing:

if (bpf_map_lookup_elem(&xsks_map, &index))
    return bpf_redirect_map(&xsks_map, index, 0);

with simply

return bpf_redirect_map(&xsks_map, index, XDP_PASS);

would save the call to xsk_map_lookup_elem().
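
Putting it together, the whole default program then collapses to something like this (a
sketch, reusing the xsks_map definition shown earlier in the thread):

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
	int index = ctx->rx_queue_index;

	/* XDP_PASS is returned by the helper when no AF_XDP socket is bound
	 * to the receiving queue, so no separate lookup is needed. */
	return bpf_redirect_map(&xsks_map, index, XDP_PASS);
}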

-Toke
Björn Töpel Oct. 20, 2019, 5:12 p.m. UTC | #18
On Sun, 20 Oct 2019 at 12:15, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>
> > On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote:
> >> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote:
> >> > On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote:
> >> > > [...]
> >>
> >> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c
> >> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268
> >
> > I see. Looks like map_gen_lookup was never implemented for xskmap.
> > How about adding it first the way array_map_gen_lookup() is implemented?
> > This will easily give 2x perf gain.
>
> I guess we should implement this for devmaps as well now that we allow
> lookups into those.
>
> However, in this particular example, the lookup from BPF is not actually
> needed, since bpf_redirect_map() will return a configurable error value
> when the map lookup fails (for exactly this use case).
>
> So replacing:
>
> if (bpf_map_lookup_elem(&xsks_map, &index))
>     return bpf_redirect_map(&xsks_map, index, 0);
>
> with simply
>
> return bpf_redirect_map(&xsks_map, index, XDP_PASS);
>
> would save the call to xsk_map_lookup_elem().
>

Thanks for the reminder! I just submitted a patch. Still, doing
map_gen_lookup() for xsk/devmaps makes sense!


Björn


> -Toke
>
Samudrala, Sridhar Oct. 21, 2019, 8:10 p.m. UTC | #19
On 10/20/2019 10:12 AM, Björn Töpel wrote:
> On Sun, 20 Oct 2019 at 12:15, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>>
>>> On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote:
>>>> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote:
>>>>> On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote:
>>>>>> [...]
>>>>
>>>> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268
>>>
>>> I see. Looks like map_gen_lookup was never implemented for xskmap.
>>> How about adding it first the way array_map_gen_lookup() is implemented?
>>> This will easily give 2x perf gain.
>>
>> I guess we should implement this for devmaps as well now that we allow
>> lookups into those.
>>
>> However, in this particular example, the lookup from BPF is not actually
>> needed, since bpf_redirect_map() will return a configurable error value
>> when the map lookup fails (for exactly this use case).
>>
>> So replacing:
>>
>> if (bpf_map_lookup_elem(&xsks_map, &index))
>>      return bpf_redirect_map(&xsks_map, index, 0);
>>
>> with simply
>>
>> return bpf_redirect_map(&xsks_map, index, XDP_PASS);
>>
>> would save the call to xsk_map_lookup_elem().
>>
> 
> Thanks for the reminder! I just submitted a patch. Still, doing the
> map_gen_lookup()  for xsk/devmaps still makes sense!
> 

I tried Bjorn's patch that avoids the lookups in the BPF prog.
https://lore.kernel.org/netdev/20191021105938.11820-1-bjorn.topel@gmail.com/

With this patch I am also seeing around a 3-4% increase in xdpsock rxdrop performance, and
the perf report looks like this.

Samples: 44K of event 'cycles', Event count (approx.): 38749965204
Overhead  Command          Shared Object              Symbol
   16.06%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
   10.18%  ksoftirqd/28     bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
   10.15%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
   10.06%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
    7.45%  xdpsock          xdpsock                    [.] main
    5.76%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
    4.51%  xdpsock          bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
    3.67%  xdpsock          [kernel.vmlinux]           [k] xsk_rcv
    3.06%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
    2.34%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_redirect
    2.33%  xdpsock          [kernel.vmlinux]           [k] xdp_do_redirect
    1.69%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_umem_peek_addr
    1.69%  xdpsock          [kernel.vmlinux]           [k] xsk_umem_peek_addr
    1.42%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
    1.19%  xdpsock          [kernel.vmlinux]           [k] bpf_xdp_redirect_map
    1.13%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
    0.95%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
    0.92%  swapper          [kernel.vmlinux]           [k] intel_idle
    0.92%  xdpsock          [kernel.vmlinux]           [k] __xsk_map_redirect
    0.80%  ksoftirqd/28     [kernel.vmlinux]           [k] __x86_indirect_thunk_rax
    0.73%  ksoftirqd/28     [i40e]                     [k] i40e_clean_programming_status
    0.71%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_lookup_elem
    0.63%  ksoftirqd/28     [kernel.vmlinux]           [k] net_rx_action
    0.62%  ksoftirqd/28     [i40e]                     [k] i40e_napi_poll
    0.58%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu

So with this patch applied, the direct receive performance improvement comes down from 46% to 42%.
I think it is still substantial enough to provide an option to allow direct receive for
certain use cases. If it is OK, I can re-spin and submit the patches on top of the latest bpf-next.
Thanks
Sridhar
Alexei Starovoitov Oct. 21, 2019, 10:34 p.m. UTC | #20
On Mon, Oct 21, 2019 at 1:10 PM Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
> On 10/20/2019 10:12 AM, Björn Töpel wrote:
> > On Sun, 20 Oct 2019 at 12:15, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >>
> >> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
> >>
> >>> On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote:
> >>>> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote:
> >>>>> On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote:
> >>>>>> [...]
> >>>>
> >>>> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c
> >>>> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268
> >>>
> >>> I see. Looks like map_gen_lookup was never implemented for xskmap.
> >>> How about adding it first the way array_map_gen_lookup() is implemented?
> >>> This will easily give 2x perf gain.
> >>
> >> I guess we should implement this for devmaps as well now that we allow
> >> lookups into those.
> >>
> >> However, in this particular example, the lookup from BPF is not actually
> >> needed, since bpf_redirect_map() will return a configurable error value
> >> when the map lookup fails (for exactly this use case).
> >>
> >> So replacing:
> >>
> >> if (bpf_map_lookup_elem(&xsks_map, &index))
> >>      return bpf_redirect_map(&xsks_map, index, 0);
> >>
> >> with simply
> >>
> >> return bpf_redirect_map(&xsks_map, index, XDP_PASS);
> >>
> >> would save the call to xsk_map_lookup_elem().
> >>
> >
> > Thanks for the reminder! I just submitted a patch. Still, doing the
> > map_gen_lookup()  for xsk/devmaps still makes sense!
> >
>
> I tried Bjorn's patch that avoids the lookups in the BPF prog.
> https://lore.kernel.org/netdev/20191021105938.11820-1-bjorn.topel@gmail.com/
>
> With this patch I am also seeing around 3-4% increase in xdpsock rxdrop performance and
> the perf report looks like this.
>
> Samples: 44K of event 'cycles', Event count (approx.): 38749965204
> Overhead  Command          Shared Object              Symbol
>    16.06%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
>    10.18%  ksoftirqd/28     bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
>    10.15%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
>    10.06%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
>     7.45%  xdpsock          xdpsock                    [.] main
>     5.76%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
>     4.51%  xdpsock          bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
>     3.67%  xdpsock          [kernel.vmlinux]           [k] xsk_rcv
>     3.06%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>     2.34%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_redirect
>     2.33%  xdpsock          [kernel.vmlinux]           [k] xdp_do_redirect
>     1.69%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_umem_peek_addr
>     1.69%  xdpsock          [kernel.vmlinux]           [k] xsk_umem_peek_addr
>     1.42%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
>     1.19%  xdpsock          [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>     1.13%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
>     0.95%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
>     0.92%  swapper          [kernel.vmlinux]           [k] intel_idle
>     0.92%  xdpsock          [kernel.vmlinux]           [k] __xsk_map_redirect
>     0.80%  ksoftirqd/28     [kernel.vmlinux]           [k] __x86_indirect_thunk_rax
>     0.73%  ksoftirqd/28     [i40e]                     [k] i40e_clean_programming_status
>     0.71%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_lookup_elem
>     0.63%  ksoftirqd/28     [kernel.vmlinux]           [k] net_rx_action
>     0.62%  ksoftirqd/28     [i40e]                     [k] i40e_napi_poll
>     0.58%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
>
> So with this patch applied, the direct receive performance improvement comes down from 46% to 42%.
> I think it is still substantial enough to provide an option to allow direct receive for
> certain use cases. If it is OK, I can re-spin and submit the patches on top of the latest bpf-next.

I think it's too early to consider such a drastic approach.
The run-time performance of an XDP program should be the same as C code.
Something is fishy in these numbers, since spending 10% of the cpu on a few
loads and a single call to bpf_xdp_redirect_map() is just not right.
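
For reference, the lookup-free program Toke suggests above boils down
to just a few loads and one helper call. A minimal sketch, assuming
the selftests-style bpf_helpers.h; the map size and section names are
illustrative, not the exact program that libbpf installs:

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") xsks_map = {
        .type        = BPF_MAP_TYPE_XSKMAP,
        .key_size    = sizeof(int),
        .value_size  = sizeof(int),
        .max_entries = 64,      /* illustrative: one slot per rx queue */
};

SEC("xdp_sock")
int xdp_sock_prog(struct xdp_md *ctx)
{
        /* Redirect to the AF_XDP socket bound to this rx queue; fall
         * back to XDP_PASS when no socket is present in the map. */
        return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
}

char _license[] SEC("license") = "GPL";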
Samudrala, Sridhar Oct. 22, 2019, 7:06 p.m. UTC | #21
On 10/21/2019 3:34 PM, Alexei Starovoitov wrote:
> On Mon, Oct 21, 2019 at 1:10 PM Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
>>
>> On 10/20/2019 10:12 AM, Björn Töpel wrote:
>>> On Sun, 20 Oct 2019 at 12:15, Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>>>>
>>>> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>>>>
>>>>> On Fri, Oct 18, 2019 at 05:45:26PM -0700, Samudrala, Sridhar wrote:
>>>>>> On 10/18/2019 5:14 PM, Alexei Starovoitov wrote:
>>>>>>> On Fri, Oct 18, 2019 at 11:40:07AM -0700, Samudrala, Sridhar wrote:
>>>>>>>>
>>>>>>>> Perf report for "AF_XDP default rxdrop" with patched kernel - mitigations ON
>>>>>>>> ==========================================================================
>>>>>>>> Samples: 44K of event 'cycles', Event count (approx.): 38532389541
>>>>>>>> Overhead  Command          Shared Object              Symbol
>>>>>>>>      15.31%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
>>>>>>>>      10.50%  ksoftirqd/28     bpf_prog_80b55d8a76303785  [k] bpf_prog_80b55d8a76303785
>>>>>>>>       9.48%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
>>>>>>>>       8.62%  xdpsock          xdpsock                    [.] main
>>>>>>>>       7.11%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
>>>>>>>>       5.81%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
>>>>>>>>       4.46%  xdpsock          bpf_prog_80b55d8a76303785  [k] bpf_prog_80b55d8a76303785
>>>>>>>>       3.83%  xdpsock          [kernel.vmlinux]           [k] xsk_rcv
>>>>>>>
>>>>>>> why everything is duplicated?
>>>>>>> Same code runs in different tasks ?
>>>>>>
>>>>>> Yes. looks like these functions run from both the app(xdpsock) context and ksoftirqd context.
>>>>>>
>>>>>>>
>>>>>>>>       2.81%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>>>>>>>>       2.78%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_map_lookup_elem
>>>>>>>>       2.44%  xdpsock          [kernel.vmlinux]           [k] xdp_do_redirect
>>>>>>>>       2.19%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_redirect
>>>>>>>>       1.62%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_umem_peek_addr
>>>>>>>>       1.57%  xdpsock          [kernel.vmlinux]           [k] xsk_umem_peek_addr
>>>>>>>>       1.32%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
>>>>>>>>       1.28%  xdpsock          [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>>>>>>>>       1.15%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
>>>>>>>>       1.12%  xdpsock          [kernel.vmlinux]           [k] xsk_map_lookup_elem
>>>>>>>>       1.06%  xdpsock          [kernel.vmlinux]           [k] __xsk_map_redirect
>>>>>>>>       0.94%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
>>>>>>>>       0.75%  ksoftirqd/28     [kernel.vmlinux]           [k] __x86_indirect_thunk_rax
>>>>>>>>       0.66%  ksoftirqd/28     [i40e]                     [k] i40e_clean_programming_status
>>>>>>>>       0.64%  ksoftirqd/28     [kernel.vmlinux]           [k] net_rx_action
>>>>>>>>       0.64%  swapper          [kernel.vmlinux]           [k] intel_idle
>>>>>>>>       0.62%  ksoftirqd/28     [i40e]                     [k] i40e_napi_poll
>>>>>>>>       0.57%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
>>>>>>>>
>>>>>>>> Perf report for "AF_XDP direct rxdrop" with patched kernel - mitigations ON
>>>>>>>> ==========================================================================
>>>>>>>> Samples: 46K of event 'cycles', Event count (approx.): 38387018585
>>>>>>>> Overhead  Command          Shared Object             Symbol
>>>>>>>>      21.94%  ksoftirqd/28     [i40e]                    [k] i40e_clean_rx_irq_zc
>>>>>>>>      14.36%  xdpsock          xdpsock                   [.] main
>>>>>>>>      11.53%  ksoftirqd/28     [kernel.vmlinux]          [k] xsk_rcv
>>>>>>>>      11.32%  xdpsock          [i40e]                    [k] i40e_clean_rx_irq_zc
>>>>>>>>       4.02%  xdpsock          [kernel.vmlinux]          [k] xsk_rcv
>>>>>>>>       2.91%  ksoftirqd/28     [kernel.vmlinux]          [k] xdp_do_redirect
>>>>>>>>       2.45%  ksoftirqd/28     [kernel.vmlinux]          [k] xsk_umem_peek_addr
>>>>>>>>       2.19%  xdpsock          [kernel.vmlinux]          [k] xsk_umem_peek_addr
>>>>>>>>       2.08%  ksoftirqd/28     [kernel.vmlinux]          [k] bpf_direct_xsk
>>>>>>>>       2.07%  ksoftirqd/28     [kernel.vmlinux]          [k] dma_direct_sync_single_for_cpu
>>>>>>>>       1.53%  ksoftirqd/28     [kernel.vmlinux]          [k] dma_direct_sync_single_for_device
>>>>>>>>       1.39%  xdpsock          [kernel.vmlinux]          [k] dma_direct_sync_single_for_device
>>>>>>>>       1.22%  ksoftirqd/28     [kernel.vmlinux]          [k] xdp_get_xsk_from_qid
>>>>>>>>       1.12%  ksoftirqd/28     [i40e]                    [k] i40e_clean_programming_status
>>>>>>>>       0.96%  ksoftirqd/28     [i40e]                    [k] i40e_napi_poll
>>>>>>>>       0.95%  ksoftirqd/28     [kernel.vmlinux]          [k] net_rx_action
>>>>>>>>       0.89%  xdpsock          [kernel.vmlinux]          [k] xdp_do_redirect
>>>>>>>>       0.83%  swapper          [i40e]                    [k] i40e_clean_rx_irq_zc
>>>>>>>>       0.70%  swapper          [kernel.vmlinux]          [k] intel_idle
>>>>>>>>       0.66%  xdpsock          [kernel.vmlinux]          [k] dma_direct_sync_single_for_cpu
>>>>>>>>       0.60%  xdpsock          [kernel.vmlinux]          [k] bpf_direct_xsk
>>>>>>>>       0.50%  ksoftirqd/28     [kernel.vmlinux]          [k] xsk_umem_discard_addr
>>>>>>>>
>>>>>>>> Based on the perf reports comparing AF_XDP default and direct rxdrop, we can say that
>>>>>>>> AF_XDP direct rxdrop codepath is avoiding the overhead of going through these functions
>>>>>>>>    bpf_prog_xxx
>>>>>>>>            bpf_xdp_redirect_map
>>>>>>>>    xsk_map_lookup_elem
>>>>>>>>            __xsk_map_redirect
>>>>>>>> With AF_XDP direct, xsk_rcv() is directly called via bpf_direct_xsk() in xdp_do_redirect()
>>>>>>>
>>>>>>> I don't think you're identifying the overhead correctly.
>>>>>>> xsk_map_lookup_elem is 1%
>>>>>>> but bpf_xdp_redirect_map() suppose to call __xsk_map_lookup_elem()
>>>>>>> which is a different function:
>>>>>>> ffffffff81493fe0 T __xsk_map_lookup_elem
>>>>>>> ffffffff81492e80 t xsk_map_lookup_elem
>>>>>>>
>>>>>>> 10% for bpf_prog_80b55d8a76303785 is huge.
>>>>>>> It's the actual code of the program _without_ any helpers.
>>>>>>> How does the program actually look?
>>>>>>
>>>>>> It is the xdp program that is loaded via xsk_load_xdp_prog() in tools/lib/bpf/xsk.c
>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/tree/tools/lib/bpf/xsk.c#n268
>>>>>
>>>>> I see. Looks like map_gen_lookup was never implemented for xskmap.
>>>>> How about adding it first the way array_map_gen_lookup() is implemented?
>>>>> This will easily give 2x perf gain.
>>>>
>>>> I guess we should implement this for devmaps as well now that we allow
>>>> lookups into those.
>>>>
>>>> However, in this particular example, the lookup from BPF is not actually
>>>> needed, since bpf_redirect_map() will return a configurable error value
>>>> when the map lookup fails (for exactly this use case).
>>>>
>>>> So replacing:
>>>>
>>>> if (bpf_map_lookup_elem(&xsks_map, &index))
>>>>       return bpf_redirect_map(&xsks_map, index, 0);
>>>>
>>>> with simply
>>>>
>>>> return bpf_redirect_map(&xsks_map, index, XDP_PASS);
>>>>
>>>> would save the call to xsk_map_lookup_elem().
>>>>
>>>
> >>> Thanks for the reminder! I just submitted a patch. Still, doing the
> >>> map_gen_lookup() for xsk/devmaps makes sense!
>>>
>>
>> I tried Bjorn's patch that avoids the lookups in the BPF prog.
>> https://lore.kernel.org/netdev/20191021105938.11820-1-bjorn.topel@gmail.com/
>>
>> With this patch I am also seeing around a 3-4% increase in xdpsock rxdrop performance, and
>> the perf report looks like this.
>>
>> Samples: 44K of event 'cycles', Event count (approx.): 38749965204
>> Overhead  Command          Shared Object              Symbol
>>     16.06%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
>>     10.18%  ksoftirqd/28     bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
>>     10.15%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
>>     10.06%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
>>      7.45%  xdpsock          xdpsock                    [.] main
>>      5.76%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
>>      4.51%  xdpsock          bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
>>      3.67%  xdpsock          [kernel.vmlinux]           [k] xsk_rcv
>>      3.06%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>>      2.34%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_redirect
>>      2.33%  xdpsock          [kernel.vmlinux]           [k] xdp_do_redirect
>>      1.69%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_umem_peek_addr
>>      1.69%  xdpsock          [kernel.vmlinux]           [k] xsk_umem_peek_addr
>>      1.42%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
>>      1.19%  xdpsock          [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>>      1.13%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
>>      0.95%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
>>      0.92%  swapper          [kernel.vmlinux]           [k] intel_idle
>>      0.92%  xdpsock          [kernel.vmlinux]           [k] __xsk_map_redirect
>>      0.80%  ksoftirqd/28     [kernel.vmlinux]           [k] __x86_indirect_thunk_rax
>>      0.73%  ksoftirqd/28     [i40e]                     [k] i40e_clean_programming_status
>>      0.71%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_lookup_elem
>>      0.63%  ksoftirqd/28     [kernel.vmlinux]           [k] net_rx_action
>>      0.62%  ksoftirqd/28     [i40e]                     [k] i40e_napi_poll
>>      0.58%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
>>
>> So with this patch applied, the direct receive performance improvement comes down from 46% to 42%.
>> I think it is still substantial enough to provide an option to allow direct receive for
>> certain use cases. If it is OK, I can re-spin and submit the patches on top of the latest bpf-next.
> 
> I think it's too early to consider such a drastic approach.
> The run-time performance of an XDP program should be the same as C code.
> Something is fishy in these numbers, since spending 10% of the cpu on a few
> loads and a single call to bpf_xdp_redirect_map() is just not right.

OK. Here is another data point that shows the perf report for the same test but with CPU mitigations
turned OFF. Here the bpf_prog overhead goes down from almost (10.18 + 4.51)% to (3.23 + 1.44)%.

   21.40%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
   14.13%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
    8.33%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
    6.09%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
    5.19%  xdpsock          xdpsock                    [.] main
    3.48%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
    3.23%  ksoftirqd/28     bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
    3.06%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_redirect
    2.72%  xdpsock          [kernel.vmlinux]           [k] xdp_do_redirect
    2.27%  xdpsock          [kernel.vmlinux]           [k] xsk_rcv
    2.10%  xdpsock          [kernel.vmlinux]           [k] xsk_umem_peek_addr
    2.09%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_umem_peek_addr
    1.89%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
    1.44%  xdpsock          bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
    1.36%  xdpsock          [kernel.vmlinux]           [k] bpf_xdp_redirect_map
    1.31%  ksoftirqd/28     [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
    1.30%  xdpsock          [kernel.vmlinux]           [k] __xsk_map_redirect
    1.23%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_device
    0.97%  ksoftirqd/28     [kernel.vmlinux]           [k] __xsk_map_lookup_elem
    0.90%  ksoftirqd/28     [i40e]                     [k] i40e_clean_programming_status
    0.81%  xdpsock          [kernel.vmlinux]           [k] dma_direct_sync_single_for_cpu
    0.76%  swapper          [i40e]                     [k] i40e_clean_rx_irq_zc
    0.75%  swapper          [kernel.vmlinux]           [k] intel_idle
    0.59%  ksoftirqd/28     [kernel.vmlinux]           [k] net_rx_action

So a major component of the bpf_prog overhead seems to be due to the CPU vulnerability mitigations.
The other component is the bpf_xdp_redirect_map() codepath.

Let me know if I should collect any other data that would further help with the perf analysis.
Alexei Starovoitov Oct. 23, 2019, 5:42 p.m. UTC | #22
On Tue, Oct 22, 2019 at 12:06 PM Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
> OK. Here is another data point that shows the perf report for the same test but with CPU mitigations
> turned OFF. Here the bpf_prog overhead goes down from almost (10.18 + 4.51)% to (3.23 + 1.44)%.
>
>    21.40%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
>    14.13%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
>     8.33%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
>     6.09%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
>     5.19%  xdpsock          xdpsock                    [.] main
>     3.48%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
>     3.23%  ksoftirqd/28     bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
>
> So a major component of the bpf_prog overhead seems to be due to the CPU vulnerability mitigations.

I feel that it's an incorrect conclusion, because the JIT is not doing
any retpolines (there are no indirect calls in bpf).
There should be no difference in bpf_prog runtime with or without mitigations.
Also, you're running as root, so no spectre mitigations either.

This 3% seems like a lot for a function that does a few loads that should
hit the d-cache and one direct call.
Please investigate why you're seeing this 10% cpu cost when mitigations
are on; perf report/annotate is the best tool for that.
Also, please double-check that you're using the latest perf, since bpf
performance analysis was greatly improved several versions ago.
I don't think an old perf will show bogus numbers, but it's better to
run the latest.

> The other component is the bpf_xdp_redirect_map() codepath.
>
> > Let me know if I should collect any other data that would further help with the perf analysis.
>
Samudrala, Sridhar Oct. 24, 2019, 6:12 p.m. UTC | #23
> > OK. Here is another data point that shows the perf report for the same test but with CPU mitigations
> > turned OFF. Here the bpf_prog overhead goes down from almost (10.18 + 4.51)% to (3.23 + 1.44)%.
> >
> >    21.40%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
> >    14.13%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
> >     8.33%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
> >     6.09%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
> >     5.19%  xdpsock          xdpsock                    [.] main
> >     3.48%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
> >     3.23%  ksoftirqd/28     bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
> >
> > So a major component of the bpf_prog overhead seems to be due to the CPU vulnerability mitigations.

> I feel that it's an incorrect conclusion, because the JIT is not doing
> any retpolines (there are no indirect calls in bpf).
> There should be no difference in bpf_prog runtime with or without mitigations.
> Also, you're running as root, so no spectre mitigations either.

> This 3% seems like a lot for a function that does a few loads that should
> hit the d-cache and one direct call.
> Please investigate why you're seeing this 10% cpu cost when mitigations
> are on; perf report/annotate is the best tool for that.
> Also, please double-check that you're using the latest perf, since bpf
> performance analysis was greatly improved several versions ago.
> I don't think an old perf will show bogus numbers, but it's better to
> run the latest.

Here is the perf annotate output for bpf_prog_3c8251c7e0fef8db with and without
mitigations turned ON, using perf built from the bpf-next tree:
   perf version 5.3.g4071324a76c1

With mitigations ON
-------------------
Samples: 6K of event 'cycles', 4000 Hz, Event count (approx.): 5646512726
bpf_prog_3c8251c7e0fef8db  bpf_prog_3c8251c7e0fef8db [Percent: local period]
  45.05      push   %rbp
   0.02      mov    %rsp,%rbp
   0.03      sub    $0x8,%rsp
  22.09      push   %rbx
   7.66      push   %r13
   1.08      push   %r14
   1.85      push   %r15
   0.63      pushq  $0x0
   1.13      mov    0x28(%rdi),%rsi
   0.47      mov    0x8(%rsi),%esi
   3.47      mov    %esi,-0x4(%rbp)
   0.02      movabs $0xffff8ab414a83e00,%rdi
   0.90      mov    $0x2,%edx
   2.85      callq  *ffffffffd149fc5f
   1.55      and    $0x6,%rax
             test   %rax,%rax
   1.48      jne    72
             mov    %rbp,%rsi
             add    $0xfffffffffffffffc,%rsi
             movabs $0xffff8ab414a83e00,%rdi
             callq  *ffffffffd0e5fd5f
             mov    %rax,%rdi
             mov    $0x2,%eax
             test   %rdi,%rdi
             je     72
             mov    -0x4(%rbp),%esi
             movabs $0xffff8ab414a83e00,%rdi
             xor    %edx,%edx
             callq  *ffffffffd149fc5f
        72:  pop    %rbx
             pop    %r15
   1.90      pop    %r14
   1.93      pop    %r13
             pop    %rbx
   3.63      leaveq
   2.27      retq

With mitigations OFF
--------------------
Samples: 2K of event 'cycles', 4000 Hz, Event count (approx.): 1872116166
bpf_prog_3c8251c7e0fef8db  bpf_prog_3c8251c7e0fef8db [Percent: local period]
   0.15      push   %rbp
             mov    %rsp,%rbp
  13.79      sub    $0x8,%rsp
   0.30      push   %rbx
   0.15      push   %r13
   0.20      push   %r14
  14.50      push   %r15
   0.20      pushq  $0x0
             mov    0x28(%rdi),%rsi
   0.25      mov    0x8(%rsi),%esi
  14.37      mov    %esi,-0x4(%rbp)
   0.25      movabs $0xffff8ea2c673b800,%rdi
             mov    $0x2,%edx
  13.60      callq  *ffffffffe50c2f38
  14.33      and    $0x6,%rax
             test   %rax,%rax
             jne    72
             mov    %rbp,%rsi
             add    $0xfffffffffffffffc,%rsi
             movabs $0xffff8ea2c673b800,%rdi
             callq  *ffffffffe4a83038
             mov    %rax,%rdi
             mov    $0x2,%eax
             test   %rdi,%rdi
             je     72
             mov    -0x4(%rbp),%esi
             movabs $0xffff8ea2c673b800,%rdi
             xor    %edx,%edx
             callq  *ffffffffe50c2f38
        72:  pop    %rbx
             pop    %r15
  13.97      pop    %r14
   0.10      pop    %r13
             pop    %rbx
  13.71      leaveq
   0.15      retq

Do you see any issues with this data? With mitigations ON, the push %rbp and push %rbx
overhead seems to be pretty high.

Thanks
Sridhar
Björn Töpel Oct. 25, 2019, 7:42 a.m. UTC | #24
On Thu, 24 Oct 2019 at 20:12, Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
[...]
>
> With mitigations ON
> -------------------
> Samples: 6K of event 'cycles', 4000 Hz, Event count (approx.): 5646512726
> bpf_prog_3c8251c7e0fef8db  bpf_prog_3c8251c7e0fef8db [Percent: local period]
>   45.05      push   %rbp
>    0.02      mov    %rsp,%rbp
>    0.03      sub    $0x8,%rsp
>   22.09      push   %rbx

[...]

>
> > Do you see any issues with this data? With mitigations ON, the push %rbp and push %rbx
> > overhead seems to be pretty high.

That's sample skid from the retpoline thunk when entering the XDP
program. Pretty expensive push otherwise! :-)

Another thought (disclaimer: I'm no spectrev2 expert, and probably don't
understand the mitigations well enough, so this is me trying to swim at
the deep end!): would it be possible to avoid the retpoline when
entering the XDP program, at least for some XDP programs that can be
proven "safe"? If so, PeterZ's upcoming static_call could be used from
the driver side.
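
A rough sketch of what that could look like with the proposed
static_call API follows; the names and the dispatch site below are
made up for illustration and are not part of this series:

#include <linux/filter.h>
#include <linux/static_call.h>

static u32 xdp_run_default(const struct bpf_prog *prog, struct xdp_buff *xdp)
{
        return BPF_PROG_RUN(prog, xdp);
}

/* One patchable call site: a direct call, no retpoline on the fast path. */
DEFINE_STATIC_CALL(xdp_run, xdp_run_default);

static u32 xdp_dispatch(const struct bpf_prog *prog, struct xdp_buff *xdp)
{
        return static_call(xdp_run)(prog, xdp);
}

/* When a "safe" program is attached, the call site could be repatched:
 * static_call_update(xdp_run, some_specialized_runner);
 */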


Björn
Björn Töpel Oct. 25, 2019, 9:07 a.m. UTC | #25
On Wed, 23 Oct 2019 at 19:42, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Oct 22, 2019 at 12:06 PM Samudrala, Sridhar
> <sridhar.samudrala@intel.com> wrote:
> >
> > OK. Here is another data point that shows the perf report for the same test but with CPU mitigations
> > turned OFF. Here the bpf_prog overhead goes down from almost (10.18 + 4.51)% to (3.23 + 1.44)%.
> >
> >    21.40%  ksoftirqd/28     [i40e]                     [k] i40e_clean_rx_irq_zc
> >    14.13%  xdpsock          [i40e]                     [k] i40e_clean_rx_irq_zc
> >     8.33%  ksoftirqd/28     [kernel.vmlinux]           [k] xsk_rcv
> >     6.09%  ksoftirqd/28     [kernel.vmlinux]           [k] xdp_do_redirect
> >     5.19%  xdpsock          xdpsock                    [.] main
> >     3.48%  ksoftirqd/28     [kernel.vmlinux]           [k] bpf_xdp_redirect_map
> >     3.23%  ksoftirqd/28     bpf_prog_3c8251c7e0fef8db  [k] bpf_prog_3c8251c7e0fef8db
> >
> > So a major component of the bpf_prog overhead seems to be due to the CPU vulnerability mitigations.
>
> I feel that it's an incorrect conclusion, because the JIT is not doing
> any retpolines (there are no indirect calls in bpf).
> There should be no difference in bpf_prog runtime with or without mitigations.
> Also, you're running as root, so no spectre mitigations either.
>
> This 3% seems like a lot for a function that does a few loads that should
> hit the d-cache and one direct call.
> Please investigate why you're seeing this 10% cpu cost when mitigations
> are on; perf report/annotate is the best tool for that.
> Also, please double-check that you're using the latest perf, since bpf
> performance analysis was greatly improved several versions ago.
> I don't think an old perf will show bogus numbers, but it's better to
> run the latest.
>

For comparison, here are the numbers on my 3 GHz Skylake with mitigations
off. (I have one internal patch that inlines xsk_rcv() into
__xsk_map_redirect, which is why xsk_rcv() is not showing up. I'll
upstream that...)

  41.79%  [kernel]                   [k] i40e_clean_rx_irq_zc
  15.55%  [kernel]                   [k] __xsk_map_redirect
   9.87%  [kernel]                   [k] xdp_do_redirect
   6.89%  [kernel]                   [k] xsk_umem_peek_addr
   6.37%  [kernel]                   [k] bpf_xdp_redirect_map
   5.02%  bpf_prog_992d9ddc835e5629  [k] bpf_prog_992d9ddc835e5629

Again, it might look weird that simple functions like
bpf_xdp_redirect_map and the XDP program are at 6% and 5%.

Let's dig into that. I let the xdpsock program (rxdrop) run on core 22,
and the ksoftirqd on core 20. Core 20 is only processing packets, plus
the regular kernel housekeeping. I did a processor trace of core 20 for
207,936 packets.

In total it's 84,407,427 instructions, of which bpf_xdp_redirect_map()
is 8,109,504 instructions, i.e. 9.6%. bpf_xdp_redirect_map() executes
39 instructions per packet for AF_XDP. That perf reports less than 9.6%
means that the IPC of that function is higher than the average, which
perf stat reports as 2.88.

The BPF program executes fewer instructions than
bpf_xdp_redirect_map(), so given that perf shows 5%, its IPC is better
than average here as well.

So, it's roughly 405 instructions per packet, and with an IPC of 2.88
that'll give ~140 cycles per packet, which on this machine
(3,000,000,000/140) is ~21.4 Mpps. The xdpsock application reports
21 Mpps. This is sane.
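
Spelled out, for anyone who wants to check the arithmetic (a trivial
standalone program using the numbers above):

/* Sanity check of the instruction/cycle/Mpps arithmetic above. */
#include <stdio.h>

int main(void)
{
        double insns = 84407427.0;  /* instructions in the trace */
        double pkts  = 207936.0;    /* packets in the trace */
        double ipc   = 2.88;        /* IPC reported by perf stat */
        double hz    = 3.0e9;       /* 3 GHz Skylake */

        double ipp = insns / pkts;  /* ~406 instructions per packet */
        double cpp = ipp / ipc;     /* ~141 cycles per packet */

        printf("%.0f insn/pkt, %.0f cycles/pkt, %.1f Mpps\n",
               ipp, cpp, hz / cpp / 1e6);  /* ~21.3 Mpps */
        return 0;
}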

The TL;DR version is: 6% and 5% for bpf_xdp_redirect_map and
bpf_prog_992d9ddc835e5629 might seem high, but it's just that the
total number of instructions executed is fairly low. So, even though
the functions are small in size, they show up as a non-negligible
percentage in perf.

At these speeds, really small things have an impact on
performance. DPDK has ~50 cycles per packet.


Björn



> > The other component is the bpf_xdp_redirect_map() codepath.
> >
> > Let me know if I should collect any other data that would further help with the perf analysis.
> >
Samudrala, Sridhar Oct. 31, 2019, 10:38 p.m. UTC | #26
[...]
> >
> > With mitigations ON
> > -------------------
> > Samples: 6K of event 'cycles', 4000 Hz, Event count (approx.): 5646512726
> > bpf_prog_3c8251c7e0fef8db  bpf_prog_3c8251c7e0fef8db [Percent: local period]
> >   45.05      push   %rbp
> >    0.02      mov    %rsp,%rbp
> >    0.03      sub    $0x8,%rsp
> >   22.09      push   %rbx

> [...]
> >
> > Do you see any issues with this data? With mitigations ON, the push %rbp and push %rbx
> > overhead seems to be pretty high.

> That's sample skid from the retpoline thunk when entering the XDP
> program. Pretty expensive push otherwise! :-)

> Another thought (disclaimer: I'm no spectrev2 expert, and probably don't
> understand the mitigations well enough, so this is me trying to swim at
> the deep end!): would it be possible to avoid the retpoline when
> entering the XDP program, at least for some XDP programs that can be
> proven "safe"? If so, PeterZ's upcoming static_call could be used from
> the driver side.

Alexei, Jakub

Do you think it will be possible to avoid this overhead when mitigations are turned ON?
The other part of the overhead is going through the redirect path.

Can I take your silence as an indication that you are now okay with an optional bypass
flag as long as it doesn't affect the normal XDP datapath? If so, I will respin and submit
the patches against the latest bpf-next.

-Sridhar
Alexei Starovoitov Oct. 31, 2019, 11:15 p.m. UTC | #27
On Thu, Oct 31, 2019 at 3:38 PM Samudrala, Sridhar
<sridhar.samudrala@intel.com> wrote:
>
> Alexei, Jakub
>
> Do you think it will be possible to avoid this overhead when mitigations are turned ON?

yes

> The other part of the overhead is going through the redirect path.
>
> Can I take your silence as an indication that you are now okay with an optional bypass
> flag as long as it doesn't affect the normal XDP datapath? If so, I will respin and submit
> the patches against the latest bpf-next.

I'm still against it.
It looks like the only motivation for it is the extra overhead due to retpolines.
IMO that's not a good reason to introduce a bunch of extra code to help a
single kernel feature.
We will have a proper solution for indirect calls.
Jakub Kicinski Nov. 1, 2019, 12:21 a.m. UTC | #28
On Thu, 31 Oct 2019 15:38:42 -0700, Samudrala, Sridhar wrote:
> Do you think it will be possible to avoid this overhead when mitigations are turned ON?
> The other part of the overhead is going through the redirect path.

Yes, you should help Maciej with the XDP bulking.

> Can I take your silence as an indication that you are now okay with an optional bypass
> flag as long as it doesn't affect the normal XDP datapath? If so, I will respin and submit
> the patches against the latest bpf-next.

This logic baffles me. I absolutely hate when people repost patches
after I nack them without so much as mentioning my objections in
the cover letter.

My concern was that we want the applications to encode fast path logic
in BPF and load that into the kernel. So your patch works fundamentally
against that goal:

I worry that, with the volume of patches that get posted each day,
the objections of a measly contributor like myself will get forgotten
if I don't reply to each posting, yet replying each time makes me look
bullish or whatnot (apart from being an utter waste of time).

Ugh.
Samudrala, Sridhar Nov. 1, 2019, 6:31 p.m. UTC | #29
On 10/31/2019 5:21 PM, Jakub Kicinski wrote:
> On Thu, 31 Oct 2019 15:38:42 -0700, Samudrala, Sridhar wrote:
>> Do you think it will be possible to avoid this overhead when mitigations are turned ON?
>> The other part of the overhead is going through the redirect path.
> 
> Yes, you should help Maciej with the XDP bulking.
> 
>> Can I take your silence as an indication that you are now okay with an optional bypass
>> flag as long as it doesn't affect the normal XDP datapath? If so, I will respin and submit
>> the patches against the latest bpf-next.
> 
> This logic baffles me. I absolutely hate when people repost patches
> after I nack them without so much as mentioning my objections in
> the cover letter.

Sorry if you got the impression that I didn't take your feedback into
account. I CCed you and also included the kernel rxdrop data that you
requested in the original series.

> 
> My concern was that we want the applications to encode fast path logic
> in BPF and load that into the kernel. So your patch works fundamentally
> against that goal:

So it looks like you are saying that the fundamental requirement is that
all AF_XDP packets need to go via a BPF program.

The reason I proposed direct receive is the overhead we are seeing when
going via a BPF program for apps that want to receive all the packets on
a specific queue.

I agree that there is work going on to reduce this overhead with bulking
and by avoiding the retpoline.
We can revisit after these optimizations get in and see whether it is
still useful to provide a direct receive option.

-Sridhar
Dan Siemon Nov. 4, 2019, 2:08 a.m. UTC | #30
On Thu, 2019-10-31 at 17:21 -0700, Jakub Kicinski wrote:
> 
> My concern was that we want the applications to encode fast path
> logic
> in BPF and load that into the kernel. So your patch works
> fundamentally
> against that goal:

I'm only one AF_XDP user but my main goal is to get packets to
userspace as fast as possible. I don't (currently) need a BPF program
in the path at all. I suspect that many other people who look at
AF_XDP as a DPDK replacement have a similar view.
diff mbox series

Patch

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 2ce57645f3cd..db4ad85d8321 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -585,6 +585,9 @@  struct bpf_redirect_info {
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
 	u32 kern_flags;
+#ifdef CONFIG_XDP_SOCKETS
+	struct xdp_sock *xsk;
+#endif
 };
 
 DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
@@ -693,6 +696,16 @@  static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
 	return res;
 }
 
+#ifdef CONFIG_XDP_SOCKETS
+#define BPF_PROG_DIRECT_XSK	0x1
+#define bpf_is_direct_xsk_prog(prog) \
+	((unsigned long)prog == BPF_PROG_DIRECT_XSK)
+DECLARE_STATIC_KEY_FALSE(xdp_direct_xsk_needed);
+u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp);
+#else
+#define bpf_is_direct_xsk_prog(prog) (false)
+#endif
+
 static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 					    struct xdp_buff *xdp)
 {
@@ -702,6 +715,11 @@  static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	 * already takes rcu_read_lock() when fetching the program, so
 	 * it's not necessary here anymore.
 	 */
+#ifdef CONFIG_XDP_SOCKETS
+	if (static_branch_unlikely(&xdp_direct_xsk_needed) &&
+	    bpf_is_direct_xsk_prog(prog))
+		return bpf_direct_xsk(prog, xdp);
+#endif
 	return BPF_PROG_RUN(prog, xdp);
 }
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 48cc71aae466..f4d0f70aa718 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -743,6 +743,7 @@  struct netdev_rx_queue {
 	struct xdp_rxq_info		xdp_rxq;
 #ifdef CONFIG_XDP_SOCKETS
 	struct xdp_umem                 *umem;
+	struct xdp_sock			*xsk;
 #endif
 } ____cacheline_aligned_in_smp;
 
@@ -1836,6 +1837,10 @@  struct net_device {
 	atomic_t		carrier_up_count;
 	atomic_t		carrier_down_count;
 
+#ifdef CONFIG_XDP_SOCKETS
+	u16			direct_xsk_count;
+#endif
+
 #ifdef CONFIG_WIRELESS_EXT
 	const struct iw_handler_def *wireless_handlers;
 	struct iw_public_data	*wireless_data;
@@ -3712,6 +3717,11 @@  int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
 			const struct sk_buff *skb);
 
+#ifdef CONFIG_XDP_SOCKETS
+int dev_set_direct_xsk_prog(struct net_device *dev);
+int dev_clear_direct_xsk_prog(struct net_device *dev);
+#endif
+
 static __always_inline int ____dev_forward_skb(struct net_device *dev,
 					       struct sk_buff *skb)
 {
diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index c9398ce7960f..9158233d34e1 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -76,6 +76,9 @@  struct xsk_map_node {
 	struct xdp_sock **map_entry;
 };
 
+/* Flags for the xdp_sock flags field. */
+#define XDP_SOCK_DIRECT (1 << 0)
+
 struct xdp_sock {
 	/* struct sock must be the first member of struct xdp_sock */
 	struct sock sk;
@@ -104,6 +107,7 @@  struct xdp_sock {
 	struct list_head map_list;
 	/* Protects map_list */
 	spinlock_t map_list_lock;
+	u16 flags;
 };
 
 struct xdp_buff;
@@ -129,6 +133,7 @@  void xsk_set_tx_need_wakeup(struct xdp_umem *umem);
 void xsk_clear_rx_need_wakeup(struct xdp_umem *umem);
 void xsk_clear_tx_need_wakeup(struct xdp_umem *umem);
 bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem);
+struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id);
 
 void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs,
 			     struct xdp_sock **map_entry);
diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index be328c59389d..d202b5d40859 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -25,6 +25,11 @@ 
  * application.
  */
 #define XDP_USE_NEED_WAKEUP (1 << 3)
+/* This option allows an AF_XDP socket bound to a queue to receive all
+ * the packets directly from that queue when there is no XDP program
+ * attached to the device.
+ */
+#define XDP_DIRECT	(1 << 4)
 
 /* Flags for xsk_umem_config flags */
 #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 205f95af67d2..871d738a78d2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1349,13 +1349,14 @@  static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 
 void bpf_prog_put(struct bpf_prog *prog)
 {
-	__bpf_prog_put(prog, true);
+	if (!bpf_is_direct_xsk_prog(prog))
+		__bpf_prog_put(prog, true);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
 u32 bpf_get_prog_id(const struct bpf_prog *prog)
 {
-	if (prog)
+	if (prog && !bpf_is_direct_xsk_prog(prog))
 		return prog->aux->id;
 
 	return 0;
@@ -1364,7 +1365,7 @@  EXPORT_SYMBOL(bpf_get_prog_id);
 
 void bpf_set_prog_id(struct bpf_prog *prog, u32 id)
 {
-	if (prog)
+	if (prog && !bpf_is_direct_xsk_prog(prog))
 		prog->aux->id = id;
 }
 EXPORT_SYMBOL(bpf_set_prog_id);
diff --git a/net/core/dev.c b/net/core/dev.c
index 866d0ad936a5..eb3cd718e580 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -8269,6 +8269,10 @@  int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 	} else {
 		if (!__dev_xdp_query(dev, bpf_op, query))
 			return 0;
+#ifdef CONFIG_XDP_SOCKETS
+		if (dev->direct_xsk_count)
+			prog = (void *)BPF_PROG_DIRECT_XSK;
+#endif
 	}
 
 	err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
@@ -8278,6 +8282,52 @@  int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 	return err;
 }
 
+#ifdef CONFIG_XDP_SOCKETS
+int dev_set_direct_xsk_prog(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	struct bpf_prog *prog;
+	bpf_op_t bpf_op;
+
+	ASSERT_RTNL();
+
+	dev->direct_xsk_count++;
+
+	bpf_op = ops->ndo_bpf;
+	if (!bpf_op)
+		return -EOPNOTSUPP;
+
+	if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
+		return 0;
+
+	prog = (void *)BPF_PROG_DIRECT_XSK;
+
+	return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, prog);
+}
+
+int dev_clear_direct_xsk_prog(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	bpf_op_t bpf_op;
+
+	ASSERT_RTNL();
+
+	dev->direct_xsk_count--;
+
+	if (dev->direct_xsk_count)
+		return 0;
+
+	bpf_op = ops->ndo_bpf;
+	if (!bpf_op)
+		return -EOPNOTSUPP;
+
+	if (__dev_xdp_query(dev, bpf_op, XDP_QUERY_PROG))
+		return 0;
+
+	return dev_xdp_install(dev, bpf_op, NULL, XDP_FLAGS_DRV_MODE, NULL);
+}
+#endif
+
 /**
  *	dev_new_index	-	allocate an ifindex
  *	@net: the applicable net namespace
diff --git a/net/core/filter.c b/net/core/filter.c
index ed6563622ce3..391d7d600284 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -73,6 +73,7 @@ 
 #include <net/lwtunnel.h>
 #include <net/ipv6_stubs.h>
 #include <net/bpf_sk_storage.h>
+#include <linux/static_key.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -3546,6 +3547,22 @@  static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 	return 0;
 }
 
+#ifdef CONFIG_XDP_SOCKETS
+static void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
+{
+	struct xdp_sock *xsk = READ_ONCE(ri->xsk);
+
+	if (xsk) {
+		ri->xsk = NULL;
+		xsk_flush(xsk);
+	}
+}
+#else
+static inline void xdp_do_flush_xsk(struct bpf_redirect_info *ri)
+{
+}
+#endif
+
 void xdp_do_flush_map(void)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
@@ -3568,6 +3585,8 @@  void xdp_do_flush_map(void)
 			break;
 		}
 	}
+
+	xdp_do_flush_xsk(ri);
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush_map);
 
@@ -3631,11 +3650,28 @@  static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 	return err;
 }
 
+#ifdef CONFIG_XDP_SOCKETS
+static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri)
+{
+	return READ_ONCE(ri->xsk);
+}
+#else
+static inline struct xdp_sock *xdp_get_direct_xsk(struct bpf_redirect_info *ri)
+{
+	return NULL;
+}
+#endif
+
 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 	struct bpf_map *map = READ_ONCE(ri->map);
+	struct xdp_sock *xsk;
+
+	xsk = xdp_get_direct_xsk(ri);
+	if (xsk)
+		return xsk_rcv(xsk, xdp);
 
 	if (likely(map))
 		return xdp_do_redirect_map(dev, xdp, xdp_prog, map, ri);
@@ -8934,4 +8970,26 @@  const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
 
 const struct bpf_prog_ops sk_reuseport_prog_ops = {
 };
+
+#ifdef CONFIG_XDP_SOCKETS
+DEFINE_STATIC_KEY_FALSE(xdp_direct_xsk_needed);
+EXPORT_SYMBOL_GPL(xdp_direct_xsk_needed);
+
+u32 bpf_direct_xsk(const struct bpf_prog *prog, struct xdp_buff *xdp)
+{
+	struct xdp_sock *xsk;
+
+	xsk = xdp_get_xsk_from_qid(xdp->rxq->dev, xdp->rxq->queue_index);
+	if (xsk) {
+		struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+		ri->xsk = xsk;
+		return XDP_REDIRECT;
+	}
+
+	return XDP_PASS;
+}
+EXPORT_SYMBOL(bpf_direct_xsk);
+#endif
+
 #endif /* CONFIG_INET */
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index fa8fbb8fa3c8..8a29939bac7e 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -24,6 +24,7 @@ 
 #include <linux/rculist.h>
 #include <net/xdp_sock.h>
 #include <net/xdp.h>
+#include <linux/if_link.h>
 
 #include "xsk_queue.h"
 #include "xdp_umem.h"
@@ -264,6 +265,45 @@  int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 	return err;
 }
 
+static void xdp_clear_direct_xsk(struct xdp_sock *xsk)
+{
+	struct net_device *dev = xsk->dev;
+	u32 qid = xsk->queue_id;
+
+	if (!dev->_rx[qid].xsk)
+		return;
+
+	dev_clear_direct_xsk_prog(dev);
+	dev->_rx[qid].xsk = NULL;
+	static_branch_dec(&xdp_direct_xsk_needed);
+	xsk->flags &= ~XDP_SOCK_DIRECT;
+}
+
+static int xdp_set_direct_xsk(struct xdp_sock *xsk)
+{
+	struct net_device *dev = xsk->dev;
+	u32 qid = xsk->queue_id;
+	int err = 0;
+
+	if (dev->_rx[qid].xsk)
+		return -EEXIST;
+
+	xsk->flags |= XDP_SOCK_DIRECT;
+	static_branch_inc(&xdp_direct_xsk_needed);
+	dev->_rx[qid].xsk = xsk;
+	err = dev_set_direct_xsk_prog(dev);
+	if (err)
+		xdp_clear_direct_xsk(xsk);
+
+	return err;
+}
+
+struct xdp_sock *xdp_get_xsk_from_qid(struct net_device *dev, u16 queue_id)
+{
+	return dev->_rx[queue_id].xsk;
+}
+EXPORT_SYMBOL(xdp_get_xsk_from_qid);
+
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries)
 {
 	xskq_produce_flush_addr_n(umem->cq, nb_entries);
@@ -464,6 +504,11 @@  static void xsk_unbind_dev(struct xdp_sock *xs)
 		return;
 	WRITE_ONCE(xs->state, XSK_UNBOUND);
 
+	if (xs->flags & XDP_SOCK_DIRECT) {
+		rtnl_lock();
+		xdp_clear_direct_xsk(xs);
+		rtnl_unlock();
+	}
 	/* Wait for driver to stop using the xdp socket. */
 	xdp_del_sk_umem(xs->umem, xs);
 	xs->dev = NULL;
@@ -604,7 +649,7 @@  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 
 	flags = sxdp->sxdp_flags;
 	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
-		      XDP_USE_NEED_WAKEUP))
+		      XDP_USE_NEED_WAKEUP | XDP_DIRECT))
 		return -EINVAL;
 
 	rtnl_lock();
@@ -632,7 +677,7 @@  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		struct socket *sock;
 
 		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
-		    (flags & XDP_USE_NEED_WAKEUP)) {
+		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_DIRECT)) {
 			/* Cannot specify flags for shared sockets. */
 			err = -EINVAL;
 			goto out_unlock;
@@ -688,6 +733,8 @@  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask);
 	xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask);
 	xdp_add_sk_umem(xs->umem, xs);
+	if (flags & XDP_DIRECT)
+		err = xdp_set_direct_xsk(xs);
 
 out_unlock:
 	if (err) {
diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index be328c59389d..d202b5d40859 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -25,6 +25,11 @@ 
  * application.
  */
 #define XDP_USE_NEED_WAKEUP (1 << 3)
+/* This option allows an AF_XDP socket bound to a queue to receive all
+ * the packets directly from that queue when there is no XDP program
+ * attached to the device.
+ */
+#define XDP_DIRECT	(1 << 4)
 
 /* Flags for xsk_umem_config flags */
 #define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0)
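
For completeness, a userspace application would opt in to the new mode
roughly as below. This is a minimal sketch built against the patched
uapi header: the umem registration and fill/rx ring setup that must
precede bind() are omitted, and the interface name and queue id are
placeholders.

#include <linux/if_xdp.h>
#include <net/if.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_XDP
#define AF_XDP 44
#endif
#ifndef PF_XDP
#define PF_XDP AF_XDP
#endif

int main(void)
{
        int fd = socket(AF_XDP, SOCK_RAW, 0);
        struct sockaddr_xdp sxdp = {
                .sxdp_family   = PF_XDP,
                .sxdp_ifindex  = if_nametoindex("eth0"),
                .sxdp_queue_id = 0,
                .sxdp_flags    = XDP_DIRECT,  /* new flag from this patch */
        };

        /* A umem and the rx/fill rings must be set up via setsockopt()
         * before this bind() will succeed; elided for brevity. */
        if (fd < 0 || bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp)))
                perror("AF_XDP direct bind");

        close(fd);
        return 0;
}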