diff mbox

[7/9] net: add skb_orphan_frags to copy aside frags with destructors

Message ID 1336056971-7839-7-git-send-email-ian.campbell@citrix.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Ian Campbell May 3, 2012, 2:56 p.m. UTC
This should be used by drivers which need to hold on to an skb for an extended
(perhaps unbounded) period of time. e.g. the tun driver which relies on
userspace consuming the skb.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: mst@redhat.com
---
 drivers/net/tun.c      |    1 +
 include/linux/skbuff.h |   11 ++++++++
 net/core/skbuff.c      |   68 ++++++++++++++++++++++++++++++++++-------------
 3 files changed, 61 insertions(+), 19 deletions(-)

Comments

Michael S. Tsirkin May 3, 2012, 3:41 p.m. UTC | #1
On Thu, May 03, 2012 at 03:56:09PM +0100, Ian Campbell wrote:
> This should be used by drivers which need to hold on to an skb for an extended
> (perhaps unbounded) period of time. e.g. the tun driver which relies on
> userspace consuming the skb.
> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> Cc: mst@redhat.com


Right. But local sockets queue at socket forever as well.
I think this should be called in skb_set_owner_r?

This might somewhat penalize speed for local clients in the name
of correctness but these are rare so being correct is
more important I think.

Also, mactap can get this when running in bridge mode, right?

> ---
>  drivers/net/tun.c      |    1 +
>  include/linux/skbuff.h |   11 ++++++++
>  net/core/skbuff.c      |   68 ++++++++++++++++++++++++++++++++++-------------
>  3 files changed, 61 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index bb8c72c..b53e04e 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -415,6 +415,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  	/* Orphan the skb - required as we might hang on to it
>  	 * for indefinite time. */
>  	skb_orphan(skb);
> +	skb_orphan_frags(skb, GFP_KERNEL);
>  
>  	/* Enqueue packet */
>  	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index ccc7d93..9145f83 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1711,6 +1711,17 @@ static inline void skb_orphan(struct sk_buff *skb)
>  }
>  
>  /**
> + *	skb_orphan_frags - orphan the frags contained in a buffer
> + *	@skb: buffer to orphan frags from
> + *	@gfp_mask: allocation mask for replacement pages
> + *
> + *	For each frag in the SKB which has a destructor (i.e. has an
> + *	owner) create a copy of that frag and release the original
> + *	page by calling the destructor.
> + */
> +extern int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask);
> +
> +/**
>   *	__skb_queue_purge - empty a list
>   *	@list: list to empty
>   *
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 945b807..f009abb 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -697,31 +697,25 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
>  }
>  EXPORT_SYMBOL_GPL(skb_morph);
>  
> -/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
> - *	@skb: the skb to modify
> - *	@gfp_mask: allocation priority
> - *
> - *	This must be called on SKBTX_DEV_ZEROCOPY skb.
> - *	It will copy all frags into kernel and drop the reference
> - *	to userspace pages.
> - *
> - *	If this function is called from an interrupt gfp_mask() must be
> - *	%GFP_ATOMIC.
> - *
> - *	Returns 0 on success or a negative error code on failure
> - *	to allocate kernel memory to copy to.
> +/*
> + * If uarg != NULL copy and replace all frags.
> + * If uarg == NULL then only copy and replace those which have a destructor
> + * pointer.
>   */
> -int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> +static int skb_copy_frags(struct sk_buff *skb, gfp_t gfp_mask,
> +			  struct ubuf_info *uarg)
>  {
>  	int i;
>  	int num_frags = skb_shinfo(skb)->nr_frags;
>  	struct page *page, *head = NULL;
> -	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
>  
>  	for (i = 0; i < num_frags; i++) {
>  		u8 *vaddr;
>  		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
>  
> +		if (!uarg && !f->page.destructor)
> +			continue;
> +
>  		page = alloc_page(GFP_ATOMIC);
>  		if (!page) {
>  			while (head) {
> @@ -739,11 +733,16 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
>  		head = page;
>  	}
>  
> -	/* skb frags release userspace buffers */
> -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> +	/* skb frags release buffers */
> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> +		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
> +		if (!uarg && !f->page.destructor)
> +			continue;
>  		skb_frag_unref(skb, i);
> +	}
>  
> -	uarg->callback(uarg);
> +	if (uarg)
> +		uarg->callback(uarg);
>  
>  	/* skb frags point to kernel buffers */
>  	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
> @@ -752,10 +751,41 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
>  		head = (struct page *)head->private;
>  	}
>  
> -	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
>  	return 0;
>  }
>  
> +/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
> + *	@skb: the skb to modify
> + *	@gfp_mask: allocation priority
> + *
> + *	This must be called on SKBTX_DEV_ZEROCOPY skb.
> + *	It will copy all frags into kernel and drop the reference
> + *	to userspace pages.
> + *
> + *	If this function is called from an interrupt gfp_mask() must be
> + *	%GFP_ATOMIC.
> + *
> + *	Returns 0 on success or a negative error code on failure
> + *	to allocate kernel memory to copy to.
> + */
> +int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> +{
> +	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
> +	int rc;
> +
> +	rc = skb_copy_frags(skb, gfp_mask, uarg);
> +
> +	if (rc == 0)
> +		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
> +
> +	return rc;
> +}
> +
> +int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
> +{
> +	return skb_copy_frags(skb, gfp_mask, NULL);
> +}
> +EXPORT_SYMBOL(skb_orphan_frags);
>  
>  /**
>   *	skb_clone	-	duplicate an sk_buff
> -- 
> 1.7.2.5
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 3, 2012, 5:55 p.m. UTC | #2
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 3 May 2012 18:41:43 +0300

> On Thu, May 03, 2012 at 03:56:09PM +0100, Ian Campbell wrote:
>> This should be used by drivers which need to hold on to an skb for an extended
>> (perhaps unbounded) period of time. e.g. the tun driver which relies on
>> userspace consuming the skb.
>> 
>> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
>> Cc: mst@redhat.com
> 
> 
> Right. But local sockets queue at socket forever as well.
> I think this should be called in skb_set_owner_r?
> 
> This might somewhat penalize speed for local clients in the name
> of correctness but these are rare so being correct is
> more important I think.

But, on the other hand, putting the check into skb_set_owner_r() is a
not so nice test to have in the fast path of every socket receive.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin May 3, 2012, 9:10 p.m. UTC | #3
On Thu, May 03, 2012 at 01:55:32PM -0400, David Miller wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Thu, 3 May 2012 18:41:43 +0300
> 
> > On Thu, May 03, 2012 at 03:56:09PM +0100, Ian Campbell wrote:
> >> This should be used by drivers which need to hold on to an skb for an extended
> >> (perhaps unbounded) period of time. e.g. the tun driver which relies on
> >> userspace consuming the skb.
> >> 
> >> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> >> Cc: mst@redhat.com
> > 
> > 
> > Right. But local sockets queue at socket forever as well.
> > I think this should be called in skb_set_owner_r?
> > 
> > This might somewhat penalize speed for local clients in the name
> > of correctness but these are rare so being correct is
> > more important I think.
> 
> But, on the other hand, putting the check into skb_set_owner_r() is a
> not so nice test to have in the fast path of every socket receive.

True.  Hmm we orphan skbs when we loop them back
so how about reusing the skb->destructor for this?

We could teach pskb_copy pskb_expand_head etc that
when skb with this flag is cloned (expand head etc)
destructor would be set to a function that copies frags.
(clone is less of a fast path so I think adding
 a branch there is less of an issue).

Of course destructor is also called from kfree_skb
but we could clear this flag before the call
in kfree_skb so that destructor can distinguish.
David Miller May 4, 2012, 6:54 a.m. UTC | #4
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 4 May 2012 00:10:24 +0300

> Hmm we orphan skbs when we loop them back so how about reusing the
> skb->destructor for this?

That's one possibility.

But I fear we're about to toss Ian into yet another rabbit hole. :-)

Let's try to converge on something quickly as I think integration of
his work has been delayed enough as-is.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin May 4, 2012, 10:03 a.m. UTC | #5
On Fri, May 04, 2012 at 02:54:33AM -0400, David Miller wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Fri, 4 May 2012 00:10:24 +0300
> 
> > Hmm we orphan skbs when we loop them back so how about reusing the
> > skb->destructor for this?
> 
> That's one possibility.
> 
> But I fear we're about to toss Ian into yet another rabbit hole. :-)
> 
> Let's try to converge on something quickly as I think integration of
> his work has been delayed enough as-is.

This is more intrusive than I'd like because on
error we need to free the skb, so need
to return error code from orphan, but it does not add
an extra branch since skb_orphan is inline.
So intrusive but should not hurt fast path.

It's weekend here, I'll work on a patch like this
Sunday.
Ian Campbell May 4, 2012, 10:51 a.m. UTC | #6
On Fri, 2012-05-04 at 11:03 +0100, Michael S. Tsirkin wrote:
> On Fri, May 04, 2012 at 02:54:33AM -0400, David Miller wrote:
> > From: "Michael S. Tsirkin" <mst@redhat.com>
> > Date: Fri, 4 May 2012 00:10:24 +0300
> > 
> > > Hmm we orphan skbs when we loop them back so how about reusing the
> > > skb->destructor for this?
> > 
> > That's one possibility.
> > 
> > But I fear we're about to toss Ian into yet another rabbit hole. :-)
> > 
> > Let's try to converge on something quickly as I think integration of
> > his work has been delayed enough as-is.
> 
> This is more intrusive than I'd like because on
> error we need to free the skb, so need
> to return error code from orphan, but it does not add
> an extra branch since skb_orphan is inline.
> So intrusive but should not hurt fast path.
> 
> It's weekend here, I'll work on a patch like this
> Sunday.

Thanks, I was starting to feel my nose twitching and my ears beginning
to elongate ;-)

Ian.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin May 6, 2012, 1:54 p.m. UTC | #7
On Thu, May 03, 2012 at 03:56:09PM +0100, Ian Campbell wrote:
> This should be used by drivers which need to hold on to an skb for an extended
> (perhaps unbounded) period of time. e.g. the tun driver which relies on
> userspace consuming the skb.
> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> Cc: mst@redhat.com
> ---
>  drivers/net/tun.c      |    1 +
>  include/linux/skbuff.h |   11 ++++++++
>  net/core/skbuff.c      |   68 ++++++++++++++++++++++++++++++++++-------------
>  3 files changed, 61 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index bb8c72c..b53e04e 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -415,6 +415,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  	/* Orphan the skb - required as we might hang on to it
>  	 * for indefinite time. */
>  	skb_orphan(skb);
> +	skb_orphan_frags(skb, GFP_KERNEL);
>  
>  	/* Enqueue packet */
>  	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);


BTW, didn't notice at the moment but there seem to be
a couple of other problems:

1. this is using GFP_KERNEL on xmit path.
2. And it's not just a question of passing in GFP_ATOMIC:
return status needs to be checked.
3. Scanning all frags just in case one of them has
a descructor also won't help performace :(
Michael S. Tsirkin May 6, 2012, 5:01 p.m. UTC | #8
On Fri, May 04, 2012 at 02:54:33AM -0400, David Miller wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Fri, 4 May 2012 00:10:24 +0300
> 
> > Hmm we orphan skbs when we loop them back so how about reusing the
> > skb->destructor for this?
> 
> That's one possibility.
> 
> But I fear we're about to toss Ian into yet another rabbit hole. :-)
> 
> Let's try to converge on something quickly as I think integration of
> his work has been delayed enough as-is.

OK I tried doing this and I recalled why we
do the copy with ubufs before clone:
the problem is that shinfo is shared between skbs,
so modifying frags like skb_orphan_frags does is racy.
Stuck for now.

So I have a question: how about reusing the TX_DEV_ZEROCOPY
machinery for this, instead of frag destructors?

Thanks,
Michael S. Tsirkin May 7, 2012, 10:24 a.m. UTC | #9
On Thu, May 03, 2012 at 03:56:09PM +0100, Ian Campbell wrote:
> This should be used by drivers which need to hold on to an skb for an extended
> (perhaps unbounded) period of time. e.g. the tun driver which relies on
> userspace consuming the skb.
> 
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> Cc: mst@redhat.com
> ---
>  drivers/net/tun.c      |    1 +
>  include/linux/skbuff.h |   11 ++++++++
>  net/core/skbuff.c      |   68 ++++++++++++++++++++++++++++++++++-------------
>  3 files changed, 61 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index bb8c72c..b53e04e 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -415,6 +415,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
>  	/* Orphan the skb - required as we might hang on to it
>  	 * for indefinite time. */
>  	skb_orphan(skb);
> +	skb_orphan_frags(skb, GFP_KERNEL);
>  
>  	/* Enqueue packet */
>  	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index ccc7d93..9145f83 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -1711,6 +1711,17 @@ static inline void skb_orphan(struct sk_buff *skb)
>  }
>  
>  /**
> + *	skb_orphan_frags - orphan the frags contained in a buffer
> + *	@skb: buffer to orphan frags from
> + *	@gfp_mask: allocation mask for replacement pages
> + *
> + *	For each frag in the SKB which has a destructor (i.e. has an
> + *	owner) create a copy of that frag and release the original
> + *	page by calling the destructor.
> + */
> +extern int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask);
> +
> +/**
>   *	__skb_queue_purge - empty a list
>   *	@list: list to empty
>   *
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 945b807..f009abb 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -697,31 +697,25 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
>  }
>  EXPORT_SYMBOL_GPL(skb_morph);
>  
> -/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
> - *	@skb: the skb to modify
> - *	@gfp_mask: allocation priority
> - *
> - *	This must be called on SKBTX_DEV_ZEROCOPY skb.
> - *	It will copy all frags into kernel and drop the reference
> - *	to userspace pages.
> - *
> - *	If this function is called from an interrupt gfp_mask() must be
> - *	%GFP_ATOMIC.
> - *
> - *	Returns 0 on success or a negative error code on failure
> - *	to allocate kernel memory to copy to.
> +/*
> + * If uarg != NULL copy and replace all frags.
> + * If uarg == NULL then only copy and replace those which have a destructor
> + * pointer.
>   */
> -int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> +static int skb_copy_frags(struct sk_buff *skb, gfp_t gfp_mask,
> +			  struct ubuf_info *uarg)
>  {
>  	int i;
>  	int num_frags = skb_shinfo(skb)->nr_frags;
>  	struct page *page, *head = NULL;
> -	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
>  
>  	for (i = 0; i < num_frags; i++) {
>  		u8 *vaddr;
>  		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
>  
> +		if (!uarg && !f->page.destructor)
> +			continue;
> +
>  		page = alloc_page(GFP_ATOMIC);
>  		if (!page) {
>  			while (head) {
> @@ -739,11 +733,16 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
>  		head = page;
>  	}
>  
> -	/* skb frags release userspace buffers */
> -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> +	/* skb frags release buffers */
> +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> +		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
> +		if (!uarg && !f->page.destructor)
> +			continue;
>  		skb_frag_unref(skb, i);
> +	}
>  
> -	uarg->callback(uarg);
> +	if (uarg)
> +		uarg->callback(uarg);
>  

So above we only linked up copied pages, but below we
try to use the list for all frags. Looks like a bug,
I think it needs to check destructor and uarg too.


>  	/* skb frags point to kernel buffers */
>  	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
> @@ -752,10 +751,41 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
>  		head = (struct page *)head->private;
>  	}
>  
> -	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
>  	return 0;
>  }
>  
> +/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
> + *	@skb: the skb to modify
> + *	@gfp_mask: allocation priority
> + *
> + *	This must be called on SKBTX_DEV_ZEROCOPY skb.
> + *	It will copy all frags into kernel and drop the reference
> + *	to userspace pages.
> + *
> + *	If this function is called from an interrupt gfp_mask() must be
> + *	%GFP_ATOMIC.
> + *
> + *	Returns 0 on success or a negative error code on failure
> + *	to allocate kernel memory to copy to.
> + */
> +int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> +{
> +	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
> +	int rc;
> +
> +	rc = skb_copy_frags(skb, gfp_mask, uarg);
> +
> +	if (rc == 0)
> +		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
> +
> +	return rc;
> +}
> +
> +int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
> +{
> +	return skb_copy_frags(skb, gfp_mask, NULL);
> +}
> +EXPORT_SYMBOL(skb_orphan_frags);
>  
>  /**
>   *	skb_clone	-	duplicate an sk_buff
> -- 
> 1.7.2.5
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ian Campbell May 9, 2012, 9:36 a.m. UTC | #10
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 945b807..f009abb 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -697,31 +697,25 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
> >  }
> >  EXPORT_SYMBOL_GPL(skb_morph);
> >  
> > -/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
> > - *	@skb: the skb to modify
> > - *	@gfp_mask: allocation priority
> > - *
> > - *	This must be called on SKBTX_DEV_ZEROCOPY skb.
> > - *	It will copy all frags into kernel and drop the reference
> > - *	to userspace pages.
> > - *
> > - *	If this function is called from an interrupt gfp_mask() must be
> > - *	%GFP_ATOMIC.
> > - *
> > - *	Returns 0 on success or a negative error code on failure
> > - *	to allocate kernel memory to copy to.
> > +/*
> > + * If uarg != NULL copy and replace all frags.
> > + * If uarg == NULL then only copy and replace those which have a destructor
> > + * pointer.
> >   */
> > -int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> > +static int skb_copy_frags(struct sk_buff *skb, gfp_t gfp_mask,
> > +			  struct ubuf_info *uarg)
> >  {
> >  	int i;
> >  	int num_frags = skb_shinfo(skb)->nr_frags;
> >  	struct page *page, *head = NULL;
> > -	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
> >  
> >  	for (i = 0; i < num_frags; i++) {
> >  		u8 *vaddr;
> >  		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
> >  
> > +		if (!uarg && !f->page.destructor)
> > +			continue;
> > +
> >  		page = alloc_page(GFP_ATOMIC);
> >  		if (!page) {
> >  			while (head) {
> > @@ -739,11 +733,16 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> >  		head = page;
> >  	}
> >  
> > -	/* skb frags release userspace buffers */
> > -	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> > +	/* skb frags release buffers */
> > +	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
> > +		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
> > +		if (!uarg && !f->page.destructor)
> > +			continue;
> >  		skb_frag_unref(skb, i);
> > +	}
> >  
> > -	uarg->callback(uarg);
> > +	if (uarg)
> > +		uarg->callback(uarg);
> >  
> 
> So above we only linked up copied pages, but below we
> try to use the list for all frags. Looks like a bug,
> I think it needs to check destructor and uarg too.

You are absolutely correct. We need the same check in all three loops.

> 
> 
> >  	/* skb frags point to kernel buffers */
> >  	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
> > @@ -752,10 +751,41 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> >  		head = (struct page *)head->private;
> >  	}
> >  
> > -	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
> >  	return 0;
> >  }


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index bb8c72c..b53e04e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -415,6 +415,7 @@  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Orphan the skb - required as we might hang on to it
 	 * for indefinite time. */
 	skb_orphan(skb);
+	skb_orphan_frags(skb, GFP_KERNEL);
 
 	/* Enqueue packet */
 	skb_queue_tail(&tun->socket.sk->sk_receive_queue, skb);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ccc7d93..9145f83 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1711,6 +1711,17 @@  static inline void skb_orphan(struct sk_buff *skb)
 }
 
 /**
+ *	skb_orphan_frags - orphan the frags contained in a buffer
+ *	@skb: buffer to orphan frags from
+ *	@gfp_mask: allocation mask for replacement pages
+ *
+ *	For each frag in the SKB which has a destructor (i.e. has an
+ *	owner) create a copy of that frag and release the original
+ *	page by calling the destructor.
+ */
+extern int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask);
+
+/**
  *	__skb_queue_purge - empty a list
  *	@list: list to empty
  *
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 945b807..f009abb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -697,31 +697,25 @@  struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
-/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
- *	@skb: the skb to modify
- *	@gfp_mask: allocation priority
- *
- *	This must be called on SKBTX_DEV_ZEROCOPY skb.
- *	It will copy all frags into kernel and drop the reference
- *	to userspace pages.
- *
- *	If this function is called from an interrupt gfp_mask() must be
- *	%GFP_ATOMIC.
- *
- *	Returns 0 on success or a negative error code on failure
- *	to allocate kernel memory to copy to.
+/*
+ * If uarg != NULL copy and replace all frags.
+ * If uarg == NULL then only copy and replace those which have a destructor
+ * pointer.
  */
-int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+static int skb_copy_frags(struct sk_buff *skb, gfp_t gfp_mask,
+			  struct ubuf_info *uarg)
 {
 	int i;
 	int num_frags = skb_shinfo(skb)->nr_frags;
 	struct page *page, *head = NULL;
-	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
 
 	for (i = 0; i < num_frags; i++) {
 		u8 *vaddr;
 		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
 
+		if (!uarg && !f->page.destructor)
+			continue;
+
 		page = alloc_page(GFP_ATOMIC);
 		if (!page) {
 			while (head) {
@@ -739,11 +733,16 @@  int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 		head = page;
 	}
 
-	/* skb frags release userspace buffers */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+	/* skb frags release buffers */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+		if (!uarg && !f->page.destructor)
+			continue;
 		skb_frag_unref(skb, i);
+	}
 
-	uarg->callback(uarg);
+	if (uarg)
+		uarg->callback(uarg);
 
 	/* skb frags point to kernel buffers */
 	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
@@ -752,10 +751,41 @@  int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
 		head = (struct page *)head->private;
 	}
 
-	skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
 	return 0;
 }
 
+/*	skb_copy_ubufs	-	copy userspace skb frags buffers to kernel
+ *	@skb: the skb to modify
+ *	@gfp_mask: allocation priority
+ *
+ *	This must be called on SKBTX_DEV_ZEROCOPY skb.
+ *	It will copy all frags into kernel and drop the reference
+ *	to userspace pages.
+ *
+ *	If this function is called from an interrupt gfp_mask() must be
+ *	%GFP_ATOMIC.
+ *
+ *	Returns 0 on success or a negative error code on failure
+ *	to allocate kernel memory to copy to.
+ */
+int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+	struct ubuf_info *uarg = skb_shinfo(skb)->destructor_arg;
+	int rc;
+
+	rc = skb_copy_frags(skb, gfp_mask, uarg);
+
+	if (rc == 0)
+		skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+
+	return rc;
+}
+
+int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
+{
+	return skb_copy_frags(skb, gfp_mask, NULL);
+}
+EXPORT_SYMBOL(skb_orphan_frags);
 
 /**
  *	skb_clone	-	duplicate an sk_buff