diff mbox series

[RFC,6/9] veth: Add ndo_xdp_xmit

Message ID 20180424143923.26519-7-toshiaki.makita1@gmail.com
State RFC, archived
Delegated to: David Miller
Headers show
Series veth: Driver XDP | expand

Commit Message

Toshiaki Makita April 24, 2018, 2:39 p.m. UTC
From: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>

This allows NIC's XDP to redirect packets to veth. The destination veth
device enqueues redirected packets to the napi ring of its peer, then
they are processed by XDP on its peer veth device.
This can be thought as calling another XDP program by XDP program using
REDIRECT, when the peer enables driver XDP.

Note that whether an XDP program is loaded on the redirect target veth
device does not affect how xdp_frames sent by ndo_xdp_xmit is handled,
since the ring sits in rx (peer) side. Instead, whether XDP program is
loaded on peer veth does.

When peer veth device has driver XDP, ndo_xdp_xmit forwards xdp_frames
to its peer without modification.
If not, ndo_xdp_xmit converts xdp_frames to skb on sender side and
invokes netif_rx rather than dropping them. Although this will not
result in good performance, I'm thinking dropping redirected packets
when XDP is not loaded on the peer device is too restrictive, so added
this fallback.

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/veth.c     | 73 +++++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/filter.h | 16 +++++++++++
 net/core/filter.c      | 11 +-------
 3 files changed, 87 insertions(+), 13 deletions(-)

Comments

Jesper Dangaard Brouer April 25, 2018, 8:24 p.m. UTC | #1
On Tue, 24 Apr 2018 23:39:20 +0900
Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:

> +static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
> +{
> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> +	int headroom = frame->data - (void *)frame;
> +	struct net_device *rcv;
> +	int err = 0;
> +
> +	rcv = rcu_dereference(priv->peer);
> +	if (unlikely(!rcv))
> +		return -ENXIO;
> +
> +	rcv_priv = netdev_priv(rcv);
> +	/* xdp_ring is initialized on receive side? */
> +	if (rcu_access_pointer(rcv_priv->xdp_prog)) {
> +		err = xdp_ok_fwd_dev(rcv, frame->len);
> +		if (unlikely(err))
> +			return err;
> +
> +		err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
> +	} else {
> +		struct sk_buff *skb;
> +
> +		skb = veth_build_skb(frame, headroom, frame->len, 0);
> +		if (unlikely(!skb))
> +			return -ENOMEM;
> +
> +		/* Get page ref in case skb is dropped in netif_rx.
> +		 * The caller is responsible for freeing the page on error.
> +		 */
> +		get_page(virt_to_page(frame->data));

I'm not sure you can make this assumption, that xdp_frames coming from
another device driver uses a refcnt based memory model. But maybe I'm
confused, as this looks like an SKB receive path, but in the
ndo_xdp_xmit().


> +		if (unlikely(veth_forward_skb(rcv, skb, false) != NET_RX_SUCCESS))
> +			return -ENXIO;
> +
> +		/* Put page ref on success */
> +		page_frag_free(frame->data);
> +	}
> +
> +	return err;
> +}
Toshiaki Makita April 26, 2018, 10:52 a.m. UTC | #2
On 2018/04/26 5:24, Jesper Dangaard Brouer wrote:
> On Tue, 24 Apr 2018 23:39:20 +0900
> Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:
> 
>> +static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
>> +{
>> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
>> +	int headroom = frame->data - (void *)frame;
>> +	struct net_device *rcv;
>> +	int err = 0;
>> +
>> +	rcv = rcu_dereference(priv->peer);
>> +	if (unlikely(!rcv))
>> +		return -ENXIO;
>> +
>> +	rcv_priv = netdev_priv(rcv);
>> +	/* xdp_ring is initialized on receive side? */
>> +	if (rcu_access_pointer(rcv_priv->xdp_prog)) {
>> +		err = xdp_ok_fwd_dev(rcv, frame->len);
>> +		if (unlikely(err))
>> +			return err;
>> +
>> +		err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
>> +	} else {
>> +		struct sk_buff *skb;
>> +
>> +		skb = veth_build_skb(frame, headroom, frame->len, 0);
>> +		if (unlikely(!skb))
>> +			return -ENOMEM;
>> +
>> +		/* Get page ref in case skb is dropped in netif_rx.
>> +		 * The caller is responsible for freeing the page on error.
>> +		 */
>> +		get_page(virt_to_page(frame->data));
> 
> I'm not sure you can make this assumption, that xdp_frames coming from
> another device driver uses a refcnt based memory model. But maybe I'm
> confused, as this looks like an SKB receive path, but in the
> ndo_xdp_xmit().

I find this path similar to cpumap, which creates skb from redirected
xdp frame. Once it is converted to skb, skb head is freed by
page_frag_free, so anyway I needed to get the refcount here regardless
of memory model.

> 
> 
>> +		if (unlikely(veth_forward_skb(rcv, skb, false) != NET_RX_SUCCESS))
>> +			return -ENXIO;
>> +
>> +		/* Put page ref on success */
>> +		page_frag_free(frame->data);
>> +	}
>> +
>> +	return err;
>> +}
> 
>
Jesper Dangaard Brouer April 30, 2018, 5:27 p.m. UTC | #3
On Thu, 26 Apr 2018 19:52:40 +0900
Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:

> On 2018/04/26 5:24, Jesper Dangaard Brouer wrote:
> > On Tue, 24 Apr 2018 23:39:20 +0900
> > Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:
> >   
> >> +static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
> >> +{
> >> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> >> +	int headroom = frame->data - (void *)frame;
> >> +	struct net_device *rcv;
> >> +	int err = 0;
> >> +
> >> +	rcv = rcu_dereference(priv->peer);
> >> +	if (unlikely(!rcv))
> >> +		return -ENXIO;
> >> +
> >> +	rcv_priv = netdev_priv(rcv);
> >> +	/* xdp_ring is initialized on receive side? */
> >> +	if (rcu_access_pointer(rcv_priv->xdp_prog)) {
> >> +		err = xdp_ok_fwd_dev(rcv, frame->len);
> >> +		if (unlikely(err))
> >> +			return err;
> >> +
> >> +		err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
> >> +	} else {
> >> +		struct sk_buff *skb;
> >> +
> >> +		skb = veth_build_skb(frame, headroom, frame->len, 0);
> >> +		if (unlikely(!skb))
> >> +			return -ENOMEM;
> >> +
> >> +		/* Get page ref in case skb is dropped in netif_rx.
> >> +		 * The caller is responsible for freeing the page on error.
> >> +		 */
> >> +		get_page(virt_to_page(frame->data));  
> > 
> > I'm not sure you can make this assumption, that xdp_frames coming from
> > another device driver uses a refcnt based memory model. But maybe I'm
> > confused, as this looks like an SKB receive path, but in the
> > ndo_xdp_xmit().  
> 
> I find this path similar to cpumap, which creates skb from redirected
> xdp frame. Once it is converted to skb, skb head is freed by
> page_frag_free, so anyway I needed to get the refcount here regardless
> of memory model.

Yes I know, I wrote cpumap ;-)

First of all, I don't want to see such xdp_frame to SKB conversion code
in every driver.  Because that increase the chances of errors.  And
when looking at the details, then it seems that you have made the
mistake of making it possible to leak xdp_frame info to the SKB (which
cpumap takes into account).

Second, I think the refcnt scheme here is wrong. The xdp_frame should
be "owned" by XDP and have the proper refcnt to deliver it directly to
the network stack.

Third, if we choose that we want a fallback, in-case XDP is not enabled
on egress dev (but it have an ndo_xdp_xmit), then it should be placed
in the generic/core code.  E.g. __bpf_tx_xdp_map() could look at the
return code from dev->netdev_ops->ndo_xdp() and create an SKB.  (Hint,
this would make it easy to implement TX bulking towards the dev).
Toshiaki Makita May 1, 2018, 1:02 a.m. UTC | #4
On 2018/05/01 2:27, Jesper Dangaard Brouer wrote:
> On Thu, 26 Apr 2018 19:52:40 +0900
> Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:
> 
>> On 2018/04/26 5:24, Jesper Dangaard Brouer wrote:
>>> On Tue, 24 Apr 2018 23:39:20 +0900
>>> Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:
>>>   
>>>> +static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
>>>> +{
>>>> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
>>>> +	int headroom = frame->data - (void *)frame;
>>>> +	struct net_device *rcv;
>>>> +	int err = 0;
>>>> +
>>>> +	rcv = rcu_dereference(priv->peer);
>>>> +	if (unlikely(!rcv))
>>>> +		return -ENXIO;
>>>> +
>>>> +	rcv_priv = netdev_priv(rcv);
>>>> +	/* xdp_ring is initialized on receive side? */
>>>> +	if (rcu_access_pointer(rcv_priv->xdp_prog)) {
>>>> +		err = xdp_ok_fwd_dev(rcv, frame->len);
>>>> +		if (unlikely(err))
>>>> +			return err;
>>>> +
>>>> +		err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
>>>> +	} else {
>>>> +		struct sk_buff *skb;
>>>> +
>>>> +		skb = veth_build_skb(frame, headroom, frame->len, 0);
>>>> +		if (unlikely(!skb))
>>>> +			return -ENOMEM;
>>>> +
>>>> +		/* Get page ref in case skb is dropped in netif_rx.
>>>> +		 * The caller is responsible for freeing the page on error.
>>>> +		 */
>>>> +		get_page(virt_to_page(frame->data));  
>>>
>>> I'm not sure you can make this assumption, that xdp_frames coming from
>>> another device driver uses a refcnt based memory model. But maybe I'm
>>> confused, as this looks like an SKB receive path, but in the
>>> ndo_xdp_xmit().  
>>
>> I find this path similar to cpumap, which creates skb from redirected
>> xdp frame. Once it is converted to skb, skb head is freed by
>> page_frag_free, so anyway I needed to get the refcount here regardless
>> of memory model.
> 
> Yes I know, I wrote cpumap ;-)
> 
> First of all, I don't want to see such xdp_frame to SKB conversion code
> in every driver.  Because that increase the chances of errors.  And
> when looking at the details, then it seems that you have made the
> mistake of making it possible to leak xdp_frame info to the SKB (which
> cpumap takes into account).

Do you mean leaving xdp_frame in skb->head is leaking something? how?

> 
> Second, I think the refcnt scheme here is wrong. The xdp_frame should
> be "owned" by XDP and have the proper refcnt to deliver it directly to
> the network stack.
> 
> Third, if we choose that we want a fallback, in-case XDP is not enabled
> on egress dev (but it have an ndo_xdp_xmit), then it should be placed
> in the generic/core code.  E.g. __bpf_tx_xdp_map() could look at the
> return code from dev->netdev_ops->ndo_xdp() and create an SKB.  (Hint,
> this would make it easy to implement TX bulking towards the dev).

Right, this is a much cleaner way.
Although I feel like we should add this fallback for veth because it
requires something which is different from other drivers (enabling XDP
on the peer device of the egress device), I'll drop the part for now. It
should not be resolved in the driver code.
Jesper Dangaard Brouer May 1, 2018, 8:14 a.m. UTC | #5
On Tue, 1 May 2018 10:02:01 +0900
Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:

> On 2018/05/01 2:27, Jesper Dangaard Brouer wrote:
> > On Thu, 26 Apr 2018 19:52:40 +0900
> > Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:
> >   
> >> On 2018/04/26 5:24, Jesper Dangaard Brouer wrote:  
> >>> On Tue, 24 Apr 2018 23:39:20 +0900
> >>> Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:
> >>>     
> >>>> +static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
> >>>> +{
> >>>> +	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
> >>>> +	int headroom = frame->data - (void *)frame;
> >>>> +	struct net_device *rcv;
> >>>> +	int err = 0;
> >>>> +
> >>>> +	rcv = rcu_dereference(priv->peer);
> >>>> +	if (unlikely(!rcv))
> >>>> +		return -ENXIO;
> >>>> +
> >>>> +	rcv_priv = netdev_priv(rcv);
> >>>> +	/* xdp_ring is initialized on receive side? */
> >>>> +	if (rcu_access_pointer(rcv_priv->xdp_prog)) {
> >>>> +		err = xdp_ok_fwd_dev(rcv, frame->len);
> >>>> +		if (unlikely(err))
> >>>> +			return err;
> >>>> +
> >>>> +		err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
> >>>> +	} else {
> >>>> +		struct sk_buff *skb;
> >>>> +
> >>>> +		skb = veth_build_skb(frame, headroom, frame->len, 0);
> >>>> +		if (unlikely(!skb))
> >>>> +			return -ENOMEM;
> >>>> +
> >>>> +		/* Get page ref in case skb is dropped in netif_rx.
> >>>> +		 * The caller is responsible for freeing the page on error.
> >>>> +		 */
> >>>> +		get_page(virt_to_page(frame->data));    
> >>>
> >>> I'm not sure you can make this assumption, that xdp_frames coming from
> >>> another device driver uses a refcnt based memory model. But maybe I'm
> >>> confused, as this looks like an SKB receive path, but in the
> >>> ndo_xdp_xmit().    
> >>
> >> I find this path similar to cpumap, which creates skb from redirected
> >> xdp frame. Once it is converted to skb, skb head is freed by
> >> page_frag_free, so anyway I needed to get the refcount here regardless
> >> of memory model.  
> > 
> > Yes I know, I wrote cpumap ;-)
> > 
> > First of all, I don't want to see such xdp_frame to SKB conversion code
> > in every driver.  Because that increase the chances of errors.  And
> > when looking at the details, then it seems that you have made the
> > mistake of making it possible to leak xdp_frame info to the SKB (which
> > cpumap takes into account).  
> 
> Do you mean leaving xdp_frame in skb->head is leaking something? how?

Like commit 97e19cce05e5 ("bpf: reserve xdp_frame size in xdp headroom")
and commit 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data
on page reuse").

But this time, the concern is a bpf_prog attached at TC/bpf_cls level, that can 
that can adjust head via bpf_skb_change_head (for XDP it is
bpf_xdp_adjust_head) into the area used by xdp_frame.  As described in
https://git.kernel.org/davem/net-next/c/6dfb970d3dbd in is not super
critical at the moment, as this _currently_ runs as CAP_SYS_ADMIN, but
we would like to move towards CAP_NET_ADMIN.

 
> > Second, I think the refcnt scheme here is wrong. The xdp_frame should
> > be "owned" by XDP and have the proper refcnt to deliver it directly to
> > the network stack.
> > 
> > Third, if we choose that we want a fallback, in-case XDP is not enabled
> > on egress dev (but it have an ndo_xdp_xmit), then it should be placed
> > in the generic/core code.  E.g. __bpf_tx_xdp_map() could look at the
> > return code from dev->netdev_ops->ndo_xdp() and create an SKB.  (Hint,
> > this would make it easy to implement TX bulking towards the dev).  
> 
> Right, this is a much cleaner way.
>
> Although I feel like we should add this fallback for veth because it
> requires something which is different from other drivers (enabling XDP
> on the peer device of the egress device), 

(This is why I Cc'ed Tariq...)

This is actually a general problem with the xdp "xmit" side (and not
specific to veth driver). The problem exists for other drivers as well.

The problem is that a driver can implement ndo_xdp_xmit(), but the
driver might not have allocated the necessary XDP TX-queue resources
yet (or it might not be possible due to system resource limits).

The current "hack" is to load an XDP prog on the egress device, and
then assume that this cause the driver to also allocate the XDP
ndo_xdo_xmit side HW resources.  This is IMHO a wrong assumption.

We need a more generic way to test if a net_device is "ready/enabled"
for handling xdp_frames via ndo_xdp_xmit.  And Tariq had some ideas on
how to implement this...

My opinion is that it is a waste of (HW/mem) resources to always
allocate resources for ndo_xdp_xmit when loading an XDP program.
Because what if my use-cases are XDP_DROP DDoS filter, or CPUMAP
redirect load-balancer, then I don't want/need ndo_xdp_xmit.  E.g.
today using ixgbe on machines with more than 96 CPUs, will fail due to
limited TX queue resources. Thus, blocking the mentioned use-cases.


> I'll drop the part for now. It should not be resolved in the driver
> code.

Thank you.
Toshiaki Makita May 2, 2018, 3:33 a.m. UTC | #6
On 18/05/01 (火) 17:14, Jesper Dangaard Brouer wrote:
> On Tue, 1 May 2018 10:02:01 +0900 Toshiaki Makita 
> <makita.toshiaki@lab.ntt.co.jp> wrote:
> 
>> On 2018/05/01 2:27, Jesper Dangaard Brouer wrote:
>>> On Thu, 26 Apr 2018 19:52:40 +0900 Toshiaki Makita 
>>> <makita.toshiaki@lab.ntt.co.jp> wrote:
>>> 
>>>> On 2018/04/26 5:24, Jesper Dangaard Brouer wrote:
>>>>> On Tue, 24 Apr 2018 23:39:20 +0900 Toshiaki Makita 
>>>>> <toshiaki.makita1@gmail.com> wrote:
>>>>> 
>>>>>> +static int veth_xdp_xmit(struct net_device *dev, struct 
>>>>>> xdp_frame *frame) +{ +	struct veth_priv *rcv_priv, *priv = 
>>>>>> netdev_priv(dev); +	int headroom = frame->data - (void 
>>>>>> *)frame; +	struct net_device *rcv; +	int err = 0; + +	rcv
>>>>>> = rcu_dereference(priv->peer); +	if (unlikely(!rcv)) + 
>>>>>> return -ENXIO; + +	rcv_priv = netdev_priv(rcv); +	/* 
>>>>>> xdp_ring is initialized on receive side? */ +	if 
>>>>>> (rcu_access_pointer(rcv_priv->xdp_prog)) { +		err = 
>>>>>> xdp_ok_fwd_dev(rcv, frame->len); +		if (unlikely(err)) + 
>>>>>> return err; + +		err = veth_xdp_enqueue(rcv_priv, 
>>>>>> veth_xdp_to_ptr(frame)); +	} else { +		struct sk_buff *skb;
>>>>>> + +		skb = veth_build_skb(frame, headroom, frame->len, 0);
>>>>>> +		if (unlikely(!skb)) +			return -ENOMEM; + +		/* Get page
>>>>>> ref in case skb is dropped in netif_rx. + * The caller is
>>>>>> responsible for freeing the page on error. +		 */ +
>>>>>> get_page(virt_to_page(frame->data));
>>>>> 
>>>>> I'm not sure you can make this assumption, that xdp_frames 
>>>>> coming from another device driver uses a refcnt based memory 
>>>>> model. But maybe I'm confused, as this looks like an SKB 
>>>>> receive path, but in the ndo_xdp_xmit().
>>>> 
>>>> I find this path similar to cpumap, which creates skb from 
>>>> redirected xdp frame. Once it is converted to skb, skb head is 
>>>> freed by page_frag_free, so anyway I needed to get the
>>>> refcount here regardless of memory model.
>>> 
>>> Yes I know, I wrote cpumap ;-)
>>> 
>>> First of all, I don't want to see such xdp_frame to SKB 
>>> conversion code in every driver.  Because that increase the 
>>> chances of errors.  And when looking at the details, then it 
>>> seems that you have made the mistake of making it possible to 
>>> leak xdp_frame info to the SKB (which cpumap takes into 
>>> account).
>> 
>> Do you mean leaving xdp_frame in skb->head is leaking something? 
>> how?
> 
> Like commit 97e19cce05e5 ("bpf: reserve xdp_frame size in xdp 
> headroom") and commit 6dfb970d3dbd ("xdp: avoid leaking info stored 
> in frame data on page reuse").

Thanks for sharing the info.

> But this time, the concern is a bpf_prog attached at TC/bpf_cls 
> level, that can that can adjust head via bpf_skb_change_head (for
> XDP it is bpf_xdp_adjust_head) into the area used by xdp_frame.  As 
> described in https://git.kernel.org/davem/net-next/c/6dfb970d3dbd in 
> is not super critical at the moment, as this _currently_ runs as 
> CAP_SYS_ADMIN, but we would like to move towards CAP_NET_ADMIN.

What I don't get is why special casing xdp_frame info. My assumption is
that the area above skb->mac_header is uninit kernel memory and should
not be readable by unprivileged users anyway. So I didn't clear the area
at this point.

>>> Second, I think the refcnt scheme here is wrong. The xdp_frame 
>>> should be "owned" by XDP and have the proper refcnt to deliver
>>> it directly to the network stack.
>>> 
>>> Third, if we choose that we want a fallback, in-case XDP is not 
>>> enabled on egress dev (but it have an ndo_xdp_xmit), then it 
>>> should be placed in the generic/core code.  E.g. 
>>> __bpf_tx_xdp_map() could look at the return code from 
>>> dev->netdev_ops->ndo_xdp() and create an SKB.  (Hint, this would 
>>> make it easy to implement TX bulking towards the dev).
>> 
>> Right, this is a much cleaner way.
>> 
>> Although I feel like we should add this fallback for veth because 
>> it requires something which is different from other drivers 
>> (enabling XDP on the peer device of the egress device),
> 
> (This is why I Cc'ed Tariq...)
> 
> This is actually a general problem with the xdp "xmit" side (and not
>  specific to veth driver). The problem exists for other drivers as 
> well.
> 
> The problem is that a driver can implement ndo_xdp_xmit(), but the 
> driver might not have allocated the necessary XDP TX-queue resources
>  yet (or it might not be possible due to system resource limits).
> 
> The current "hack" is to load an XDP prog on the egress device, and 
> then assume that this cause the driver to also allocate the XDP 
> ndo_xdo_xmit side HW resources.  This is IMHO a wrong assumption.
 >
> We need a more generic way to test if a net_device is "ready/enabled"
> for handling xdp_frames via ndo_xdp_xmit.  And Tariq had some ideas
> on how to implement this...

My assumption on REDIRECT requirement came from this.
https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=306da4e685b

I guess you are saying thing are changing, and having an XDP program 
attached on the egress device is no longer generally sufficient. Looking 
forward to Tariq's solution.

Toshiaki Makita

> 
> My opinion is that it is a waste of (HW/mem) resources to always 
> allocate resources for ndo_xdp_xmit when loading an XDP program. 
> Because what if my use-cases are XDP_DROP DDoS filter, or CPUMAP 
> redirect load-balancer, then I don't want/need ndo_xdp_xmit.  E.g. 
> today using ixgbe on machines with more than 96 CPUs, will fail due 
> to limited TX queue resources. Thus, blocking the mentioned 
> use-cases.
> 
> 
>> I'll drop the part for now. It should not be resolved in the driver
>> code.
> 
> Thank you.
>
Jesper Dangaard Brouer May 2, 2018, 5:56 a.m. UTC | #7
On Wed, 2 May 2018 12:33:47 +0900
Toshiaki Makita <toshiaki.makita1@gmail.com> wrote:

> On 18/05/01 (火) 17:14, Jesper Dangaard Brouer wrote:
> > On Tue, 1 May 2018 10:02:01 +0900 Toshiaki Makita 
> > <makita.toshiaki@lab.ntt.co.jp> wrote:
> >   
> >> On 2018/05/01 2:27, Jesper Dangaard Brouer wrote:  
> >>> On Thu, 26 Apr 2018 19:52:40 +0900 Toshiaki Makita 
> >>> <makita.toshiaki@lab.ntt.co.jp> wrote:
> >>>   
> >>>> On 2018/04/26 5:24, Jesper Dangaard Brouer wrote:  
> >>>>> On Tue, 24 Apr 2018 23:39:20 +0900 Toshiaki Makita 
> >>>>> <toshiaki.makita1@gmail.com> wrote:
> >>>>>   
> >>>>>> +static int veth_xdp_xmit(struct net_device *dev, struct 
[...]
> >>>>> 
> >>>>> I'm not sure you can make this assumption, that xdp_frames 
> >>>>> coming from another device driver uses a refcnt based memory 
> >>>>> model. But maybe I'm confused, as this looks like an SKB 
> >>>>> receive path, but in the ndo_xdp_xmit().  
> >>>> 
> >>>> I find this path similar to cpumap, which creates skb from 
> >>>> redirected xdp frame. Once it is converted to skb, skb head is 
> >>>> freed by page_frag_free, so anyway I needed to get the
> >>>> refcount here regardless of memory model.  
> >>> 
> >>> Yes I know, I wrote cpumap ;-)
> >>> 
> >>> First of all, I don't want to see such xdp_frame to SKB 
> >>> conversion code in every driver.  Because that increase the 
> >>> chances of errors.  And when looking at the details, then it 
> >>> seems that you have made the mistake of making it possible to 
> >>> leak xdp_frame info to the SKB (which cpumap takes into 
> >>> account).  
> >> 
> >> Do you mean leaving xdp_frame in skb->head is leaking something? 
> >> how?  
> > 
> > Like commit 97e19cce05e5 ("bpf: reserve xdp_frame size in xdp 
> > headroom") and commit 6dfb970d3dbd ("xdp: avoid leaking info stored 
> > in frame data on page reuse").  
> 
> Thanks for sharing the info.
> 
> > But this time, the concern is a bpf_prog attached at TC/bpf_cls 
> > level, that can that can adjust head via bpf_skb_change_head (for
> > XDP it is bpf_xdp_adjust_head) into the area used by xdp_frame.  As 
> > described in https://git.kernel.org/davem/net-next/c/6dfb970d3dbd in 
> > is not super critical at the moment, as this _currently_ runs as 
> > CAP_SYS_ADMIN, but we would like to move towards CAP_NET_ADMIN.  
> 
> What I don't get is why special casing xdp_frame info. My assumption is
> that the area above skb->mac_header is uninit kernel memory and should
> not be readable by unprivileged users anyway. So I didn't clear the area
> at this point.

This is also my understanding. But Alexei explicitly asked me to handle
this xdp_frame info case.  I assume that more work is required for the
transition from CAP_SYS_ADMIN to CAP_NET_ADMIN, we just don't want to
add more/new code that makes this more difficult.


> >>> Second, I think the refcnt scheme here is wrong. The xdp_frame 
> >>> should be "owned" by XDP and have the proper refcnt to deliver
> >>> it directly to the network stack.
> >>> 
> >>> Third, if we choose that we want a fallback, in-case XDP is not 
> >>> enabled on egress dev (but it have an ndo_xdp_xmit), then it 
> >>> should be placed in the generic/core code.  E.g. 
> >>> __bpf_tx_xdp_map() could look at the return code from 
> >>> dev->netdev_ops->ndo_xdp() and create an SKB.  (Hint, this would 
> >>> make it easy to implement TX bulking towards the dev).  
> >> 
> >> Right, this is a much cleaner way.
> >> 
> >> Although I feel like we should add this fallback for veth because 
> >> it requires something which is different from other drivers 
> >> (enabling XDP on the peer device of the egress device),  
> > 
> > (This is why I Cc'ed Tariq...)
> > 
> > This is actually a general problem with the xdp "xmit" side (and not
> >  specific to veth driver). The problem exists for other drivers as 
> > well.
> > 
> > The problem is that a driver can implement ndo_xdp_xmit(), but the 
> > driver might not have allocated the necessary XDP TX-queue resources
> >  yet (or it might not be possible due to system resource limits).
> > 
> > The current "hack" is to load an XDP prog on the egress device, and 
> > then assume that this cause the driver to also allocate the XDP 
> > ndo_xdo_xmit side HW resources.  This is IMHO a wrong assumption.
>  >
> > We need a more generic way to test if a net_device is "ready/enabled"
> > for handling xdp_frames via ndo_xdp_xmit.  And Tariq had some ideas
> > on how to implement this...  
> 
> My assumption on REDIRECT requirement came from this.
> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=306da4e685b

Yes, I wrote that.

> I guess you are saying thing are changing, and having an XDP program 
> attached on the egress device is no longer generally sufficient. Looking 
> forward to Tariq's solution.

Yes, (hopefully) things are changing. Loading a dummy XDP program to
enable ndo_xdp_xmit, turned out to be a bad model, for all the reasons
I listed.

I hope Tariq find some time to work on this ... ;-)


> Toshiaki Makita
> 
> > 
> > My opinion is that it is a waste of (HW/mem) resources to always 
> > allocate resources for ndo_xdp_xmit when loading an XDP program. 
> > Because what if my use-cases are XDP_DROP DDoS filter, or CPUMAP 
> > redirect load-balancer, then I don't want/need ndo_xdp_xmit.  E.g. 
> > today using ixgbe on machines with more than 96 CPUs, will fail due 
> > to limited TX queue resources. Thus, blocking the mentioned 
> > use-cases.
> >
diff mbox series

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 89c91c1c9935..b1d591be0eba 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -54,6 +54,11 @@  static bool veth_is_xdp_frame(void *ptr)
 	return (unsigned long)ptr & VETH_XDP_FLAG;
 }
 
+static void *veth_xdp_to_ptr(void *ptr)
+{
+	return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
+}
+
 static void *veth_ptr_to_xdp(void *ptr)
 {
 	return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
@@ -138,7 +143,7 @@  static void veth_ptr_free(void *ptr)
 	}
 }
 
-static void veth_xdp_flush(struct veth_priv *priv)
+static void __veth_xdp_flush(struct veth_priv *priv)
 {
 	/* Write ptr_ring before reading rx_notify_masked */
 	smp_mb();
@@ -206,7 +211,7 @@  static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	/* TODO: check xmit_more and tx_stopped */
 	if (rcv_xdp)
-		veth_xdp_flush(rcv_priv);
+		__veth_xdp_flush(rcv_priv);
 
 	rcu_read_unlock();
 
@@ -281,6 +286,66 @@  static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
 	return skb;
 }
 
+static int veth_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
+{
+	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
+	int headroom = frame->data - (void *)frame;
+	struct net_device *rcv;
+	int err = 0;
+
+	rcv = rcu_dereference(priv->peer);
+	if (unlikely(!rcv))
+		return -ENXIO;
+
+	rcv_priv = netdev_priv(rcv);
+	/* xdp_ring is initialized on receive side? */
+	if (rcu_access_pointer(rcv_priv->xdp_prog)) {
+		err = xdp_ok_fwd_dev(rcv, frame->len);
+		if (unlikely(err))
+			return err;
+
+		err = veth_xdp_enqueue(rcv_priv, veth_xdp_to_ptr(frame));
+	} else {
+		struct sk_buff *skb;
+
+		skb = veth_build_skb(frame, headroom, frame->len, 0);
+		if (unlikely(!skb))
+			return -ENOMEM;
+
+		/* Get page ref in case skb is dropped in netif_rx.
+		 * The caller is responsible for freeing the page on error.
+		 */
+		get_page(virt_to_page(frame->data));
+		if (unlikely(veth_forward_skb(rcv, skb, false) != NET_RX_SUCCESS))
+			return -ENXIO;
+
+		/* Put page ref on success */
+		page_frag_free(frame->data);
+	}
+
+	return err;
+}
+
+static void veth_xdp_flush(struct net_device *dev)
+{
+	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
+	struct net_device *rcv;
+
+	rcu_read_lock();
+	rcv = rcu_dereference(priv->peer);
+	if (unlikely(!rcv))
+		goto out;
+
+	rcv_priv = netdev_priv(rcv);
+	/* xdp_ring is initialized on receive side? */
+	if (unlikely(!rcu_access_pointer(rcv_priv->xdp_prog)))
+		goto out;
+
+	__veth_xdp_flush(rcv_priv);
+out:
+	rcu_read_unlock();
+}
+
 static struct sk_buff *veth_xdp_rcv_one(struct veth_priv *priv,
 					struct xdp_frame *frame)
 {
@@ -580,7 +645,7 @@  static void veth_poll_controller(struct net_device *dev)
 
 	rcu_read_lock();
 	if (rcu_access_pointer(priv->xdp_prog))
-		veth_xdp_flush(priv);
+		__veth_xdp_flush(priv);
 	rcu_read_unlock();
 }
 #endif	/* CONFIG_NET_POLL_CONTROLLER */
@@ -730,6 +795,8 @@  static const struct net_device_ops veth_netdev_ops = {
 	.ndo_features_check	= passthru_features_check,
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
 	.ndo_bpf		= veth_xdp,
+	.ndo_xdp_xmit		= veth_xdp_xmit,
+	.ndo_xdp_flush		= veth_xdp_flush,
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b2308174..7d043f51d1d7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -19,6 +19,7 @@ 
 #include <linux/cryptohash.h>
 #include <linux/set_memory.h>
 #include <linux/kallsyms.h>
+#include <linux/if_vlan.h>
 
 #include <net/sch_generic.h>
 
@@ -752,6 +753,21 @@  static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 
+static __always_inline int
+xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen)
+{
+	unsigned int len;
+
+	if (unlikely(!(fwd->flags & IFF_UP)))
+		return -ENETDOWN;
+
+	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
+	if (pktlen > len)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 /* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the
  * same cpu context. Further for best results no more than a single map
  * for the do_redirect/do_flush pair should be used. This limitation is
diff --git a/net/core/filter.c b/net/core/filter.c
index a374b8560bc4..25ae8ffaa968 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2923,16 +2923,7 @@  EXPORT_SYMBOL_GPL(xdp_do_redirect);
 
 static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
 {
-	unsigned int len;
-
-	if (unlikely(!(fwd->flags & IFF_UP)))
-		return -ENETDOWN;
-
-	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
-	if (skb->len > len)
-		return -EMSGSIZE;
-
-	return 0;
+	return xdp_ok_fwd_dev(fwd, skb->len);
 }
 
 static int xdp_do_generic_redirect_map(struct net_device *dev,