diff mbox

[net-next-2.6] net: pskb_expand_head() optimization

Message ID 20100910.125449.235704956.davem@davemloft.net
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

David Miller Sept. 10, 2010, 7:54 p.m. UTC
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 07 Sep 2010 11:37:28 +0200

> Le mardi 07 septembre 2010 à 09:16 +0000, Jarek Poplawski a écrit :
>> On 2010-09-07 07:02, Eric Dumazet wrote:
> 
>> > 
>> > I understand what you want to do, but problem is we need to perform a
>> > CAS2 operation : atomically changes two values (dataref and frag_list)
>> 
>> Alas I can't understand why do you think these clone and atomic tests
>> in skb_release_data() don't protect skb_shinfo(skb)->frag_list enough.
>> 
> 
> It was early in the morning, before a cup of tea.
> 
> David only had to set frag_list in the new shinfo, not the old.

Ok, how does this look?

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Jarek Poplawski Sept. 11, 2010, 12:31 p.m. UTC | #1
On Fri, Sep 10, 2010 at 12:54:49PM -0700, David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 07 Sep 2010 11:37:28 +0200
> 
> > Le mardi 07 septembre 2010 ? 09:16 +0000, Jarek Poplawski a écrit :
> >> On 2010-09-07 07:02, Eric Dumazet wrote:
> > 
> >> > 
> >> > I understand what you want to do, but problem is we need to perform a
> >> > CAS2 operation : atomically changes two values (dataref and frag_list)
> >> 
> >> Alas I can't understand why do you think these clone and atomic tests
> >> in skb_release_data() don't protect skb_shinfo(skb)->frag_list enough.
> >> 
> > 
> > It was early in the morning, before a cup of tea.
> > 
> > David only had to set frag_list in the new shinfo, not the old.
> 
> Ok, how does this look?
> 
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 752c197..aaa9750 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -327,6 +327,32 @@ static void skb_clone_fraglist(struct sk_buff *skb)
>  		skb_get(list);
>  }
>  
> +static struct sk_buff *skb_copy_fraglist(struct sk_buff *parent,
> +					 gfp_t gfp_mask)
> +{
> +	struct sk_buff *first_skb = NULL;
> +	struct sk_buff *prev_skb = NULL;
> +	struct sk_buff *skb;
> +
> +	skb_walk_frags(parent, skb) {
> +		struct sk_buff *nskb = pskb_copy(skb, gfp_mask);
> +
> +		if (!nskb)
> +			goto fail;
> +		if (!first_skb)
> +			first_skb = skb;

Probably here and below: "= nskb"

> +		else
> +			prev_skb->next = skb;
> +		prev_skb = skb;
> +	}
> +
> +	return first_skb;
> +
> +fail:

With "if (first_skb)" here it would look better to me even if it
currently doesn't matter.

Otherwise seems OK, but I still would like to know the scenario
demanding this change.

Jarek P.

> +	skb_drop_list(&first_skb);
> +	return NULL;
> +}
> +
>  static void skb_release_data(struct sk_buff *skb)
>  {
>  	if (!skb->cloned ||
> @@ -775,11 +801,12 @@ EXPORT_SYMBOL(pskb_copy);
>  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>  		     gfp_t gfp_mask)
>  {
> -	int i;
> -	u8 *data;
>  	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
> -	long off;
> +	struct skb_shared_info *new_shinfo;
>  	bool fastpath;
> +	u8 *data;
> +	long off;
> +	int i;
>  
>  	BUG_ON(nhead < 0);
>  
> @@ -797,8 +824,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>  	 */
>  	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
>  
> -	memcpy((struct skb_shared_info *)(data + size),
> -	       skb_shinfo(skb),
> +	new_shinfo = (struct skb_shared_info *)(data + size);
> +	memcpy(new_shinfo, skb_shinfo(skb),
>  	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
>  
>  	/* Check if we can avoid taking references on fragments if we own
> @@ -815,14 +842,20 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>  	if (fastpath) {
>  		kfree(skb->head);
>  	} else {
> +		if (skb_has_frag_list(skb)) {
> +			struct sk_buff *new_list;
> +
> +			new_list = skb_copy_fraglist(skb, gfp_mask);
> +			if (!new_list)
> +				goto free_data;
> +			new_shinfo->frag_list = new_list;
> +		}
>  		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
>  			get_page(skb_shinfo(skb)->frags[i].page);
>  
> -		if (skb_has_frag_list(skb))
> -			skb_clone_fraglist(skb);
> -
>  		skb_release_data(skb);
>  	}
> +
>  	off = (data + nhead) - skb->head;
>  
>  	skb->head     = data;
> @@ -848,6 +881,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
>  	atomic_set(&skb_shinfo(skb)->dataref, 1);
>  	return 0;
>  
> +free_data:
> +	kfree(data);
>  nodata:
>  	return -ENOMEM;
>  }
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 12, 2010, 3:30 a.m. UTC | #2
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sat, 11 Sep 2010 14:31:40 +0200

> Otherwise seems OK, but I still would like to know the scenario
> demanding this change.

I want to make sk_buff use list_head, including all uses such as
frag_list et al.

If the frag_list chain can be shared, a doubly linked list cannot be
used.

This is someting I've been gradually working on now for more than 2
years :)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Sept. 12, 2010, 10:45 a.m. UTC | #3
On Sat, Sep 11, 2010 at 08:30:02PM -0700, David Miller wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Sat, 11 Sep 2010 14:31:40 +0200
> 
> > Otherwise seems OK, but I still would like to know the scenario
> > demanding this change.
> 
> I want to make sk_buff use list_head, including all uses such as
> frag_list et al.
> 
> If the frag_list chain can be shared, a doubly linked list cannot be
> used.
> 
> This is someting I've been gradually working on now for more than 2
> years :)

Hmm... Then the first message/changelog in this thread seems to
describe the future bug, only with doubly linked lists. If so, it was
a bit misleading to me ;-)

Then a few more questions:
1) if doubly linked lists really require such pskb_copying, isn't it
   all too costly?
2) why skb_clone isn't enough instead of pskb_copy?
3) since skb_clone has some cost too, why e.g. saving only the pointer
   to the tail of the list in skb_shared_info isn't enough?

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Sept. 12, 2010, 10:58 a.m. UTC | #4
On Sun, Sep 12, 2010 at 12:45:34PM +0200, Jarek Poplawski wrote:
> On Sat, Sep 11, 2010 at 08:30:02PM -0700, David Miller wrote:
> > From: Jarek Poplawski <jarkao2@gmail.com>
> > Date: Sat, 11 Sep 2010 14:31:40 +0200
> > 
> > > Otherwise seems OK, but I still would like to know the scenario
> > > demanding this change.
> > 
> > I want to make sk_buff use list_head, including all uses such as
> > frag_list et al.
> > 
> > If the frag_list chain can be shared, a doubly linked list cannot be
> > used.
> > 
> > This is someting I've been gradually working on now for more than 2
> > years :)
> 
> Hmm... Then the first message/changelog in this thread seems to
> describe the future bug, only with doubly linked lists. If so, it was
> a bit misleading to me ;-)
> 
> Then a few more questions:
> 1) if doubly linked lists really require such pskb_copying, isn't it
>    all too costly?
> 2) why skb_clone isn't enough instead of pskb_copy?
> 3) since skb_clone has some cost too, why e.g. saving only the pointer
>    to the tail of the list in skb_shared_info isn't enough?
...3a) IOW, do we really need this double linking...

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 12, 2010, 3:58 p.m. UTC | #5
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sun, 12 Sep 2010 12:45:34 +0200

> Then a few more questions:
> 1) if doubly linked lists really require such pskb_copying, isn't it
>    all too costly?

In the common case the data reference will be one, so we will not
copy.

> 2) why skb_clone isn't enough instead of pskb_copy?

Can't share the metadata.

> 3) since skb_clone has some cost too, why e.g. saving only the pointer
>    to the tail of the list in skb_shared_info isn't enough?

Then we won't get the rest of the advantages of using list_head such
as prefetching during traversals, automatic debugging facilities, et al.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 12, 2010, 4:13 p.m. UTC | #6
BTW, Jarek, as to your idea to store a tail pointer in the shinfo, how
will you sync that tail pointer in all of the shinfo instances
referencing the frag list?

It simply can't work, we have to copy.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ben Pfaff Sept. 12, 2010, 7:55 p.m. UTC | #7
David Miller <davem@davemloft.net> writes:

>> 3) since skb_clone has some cost too, why e.g. saving only the pointer
>>    to the tail of the list in skb_shared_info isn't enough?
>
> Then we won't get the rest of the advantages of using list_head such
> as prefetching during traversals, automatic debugging facilities, et al.

Did you see the recent patch from Andi Kleen where he proposes
removing this prefetching in most situations because the costs
outweigh the benefits on most modern architectures?
        http://permalink.gmane.org/gmane.linux.kernel/1033281

I'm not saying that list_head doesn't have other advantages
though.
David Miller Sept. 12, 2010, 8:24 p.m. UTC | #8
From: Ben Pfaff <blp@cs.stanford.edu>
Date: Sun, 12 Sep 2010 12:55:54 -0700

> David Miller <davem@davemloft.net> writes:
> 
>>> 3) since skb_clone has some cost too, why e.g. saving only the pointer
>>>    to the tail of the list in skb_shared_info isn't enough?
>>
>> Then we won't get the rest of the advantages of using list_head such
>> as prefetching during traversals, automatic debugging facilities, et al.
> 
> Did you see the recent patch from Andi Kleen where he proposes
> removing this prefetching in most situations because the costs
> outweigh the benefits on most modern architectures?
>         http://permalink.gmane.org/gmane.linux.kernel/1033281
> 
> I'm not saying that list_head doesn't have other advantages
> though.

Yes I saw it and I somewhat disagree with him, but don't care
enough to argue with him about it.  There are much more important
things to apply my mind and time to at the moment :-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Sept. 12, 2010, 8:45 p.m. UTC | #9
On Sun, Sep 12, 2010 at 08:58:33AM -0700, David Miller wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Sun, 12 Sep 2010 12:45:34 +0200
> 
> > Then a few more questions:
> > 1) if doubly linked lists really require such pskb_copying, isn't it
> >    all too costly?
> 
> In the common case the data reference will be one, so we will not
> copy.

Even if so, one such a case on the fast path should hit performance,
so it would need special reviewing.

> 
> > 2) why skb_clone isn't enough instead of pskb_copy?
> 
> Can't share the metadata.

I'd really like to understand why the change in handling next/prev
should affect more than skb pointers wrt. current sharing.

> 
> > 3) since skb_clone has some cost too, why e.g. saving only the pointer
> >    to the tail of the list in skb_shared_info isn't enough?
> 
> Then we won't get the rest of the advantages of using list_head such
> as prefetching during traversals, automatic debugging facilities, et al.

Right, we need to sum pros and cons. So, what's the pros? ;-)

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Sept. 12, 2010, 8:57 p.m. UTC | #10
On Sun, Sep 12, 2010 at 09:13:53AM -0700, David Miller wrote:
> 
> BTW, Jarek, as to your idea to store a tail pointer in the shinfo, how
> will you sync that tail pointer in all of the shinfo instances
> referencing the frag list?
> 
> It simply can't work, we have to copy.

The question is if we need to sync at all? This is shared data at the
moment, so I can't imagine how the list (especialy doubly linked)
could be changed without locking? And even if it's possible, I doubt
copying e.g. like in your current patch can help when an skb is added
at the tail later.

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 12, 2010, 10:08 p.m. UTC | #11
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sun, 12 Sep 2010 22:57:22 +0200

> On Sun, Sep 12, 2010 at 09:13:53AM -0700, David Miller wrote:
>> 
>> BTW, Jarek, as to your idea to store a tail pointer in the shinfo, how
>> will you sync that tail pointer in all of the shinfo instances
>> referencing the frag list?
>> 
>> It simply can't work, we have to copy.
> 
> The question is if we need to sync at all? This is shared data at the
> moment, so I can't imagine how the list (especialy doubly linked)
> could be changed without locking? And even if it's possible, I doubt
> copying e.g. like in your current patch can help when an skb is added
> at the tail later.

That's the fundamental issue.

If you look, everywhere we curently do that trick of "use the
skb->prev pointer to remmeber the frag_list tail" the code knows
it has exclusive access to both the skb metadata and the
underlying data.

But for modifications of the frag list during the SKBs lifetime
that's another issue, entirely.  All of these functions trimming
the head or tail of the SKB data which can modify the frag
list elements, they can be called from all kinds of contexts.
Look for Alexey Kuznetsov's comments in skbuff.c that read
"mincing fragments" and similar.

The real win with my work is complete unification of all list
handling, and making our packet handling code much more "hackable"
by non-networking kernel hackers.

Really we have the last major core datastructures that do not
use standard lists, and I'm going to convert it so we can
be sane like the rest of the kernel. :-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Sept. 13, 2010, 7:49 a.m. UTC | #12
On 2010-09-13 00:08, David Miller wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Sun, 12 Sep 2010 22:57:22 +0200
> 
>> On Sun, Sep 12, 2010 at 09:13:53AM -0700, David Miller wrote:
>>>
>>> BTW, Jarek, as to your idea to store a tail pointer in the shinfo, how
>>> will you sync that tail pointer in all of the shinfo instances
>>> referencing the frag list?
>>>
>>> It simply can't work, we have to copy.
>>
>> The question is if we need to sync at all? This is shared data at the
>> moment, so I can't imagine how the list (especialy doubly linked)
>> could be changed without locking? And even if it's possible, I doubt
>> copying e.g. like in your current patch can help when an skb is added
>> at the tail later.
> 
> That's the fundamental issue.
> 
> If you look, everywhere we curently do that trick of "use the
> skb->prev pointer to remmeber the frag_list tail" the code knows
> it has exclusive access to both the skb metadata and the
> underlying data.
> 
> But for modifications of the frag list during the SKBs lifetime
> that's another issue, entirely.  All of these functions trimming
> the head or tail of the SKB data which can modify the frag
> list elements, they can be called from all kinds of contexts.
> Look for Alexey Kuznetsov's comments in skbuff.c that read
> "mincing fragments" and similar.

OK, I've found the skb_cow_data() comments, but I can see there only
a modification of copied data.

> 
> The real win with my work is complete unification of all list
> handling, and making our packet handling code much more "hackable"
> by non-networking kernel hackers.
> 
> Really we have the last major core datastructures that do not
> use standard lists, and I'm going to convert it so we can
> be sane like the rest of the kernel. :-)

I guess we can stay with different opinions, no problem, at least to
me. IMHO, considering the need for additional copying or even only
cloning data where it's not currently done, doubly linked list is
too costly for the frag_list. It would affect pskb_expand_head(),
pskb_copy() but probably also skb_segment(), and maybe more. So, with
GROwing(!) importance of fragmented packets, I doubt this copying will
be as unlikely as presumed.

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 20, 2010, 12:17 a.m. UTC | #13
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Sat, 11 Sep 2010 14:31:40 +0200

> Otherwise seems OK, but I still would like to know the scenario
> demanding this change.

Ok Jarek, after some more consideration I agree with you.
Removing this kind of sharing would be unwise, ho hum...

While pondering over this I thought about why we even need
frag lists at all.  We need them for two reasons:

1) Because we have an inability to turn kmalloc data references into
   page based ones easily.

   We've run into this sort of problem with socket splice() too.

2) For recording the segmentation points of a fragmented packet.

We definitely don't use frag lists for performance, if you look
in the GRO history the GRO code specifically has been changed
to prefer accumulating into the page vector whenever possible
because using frag lists is a lot slower.

Anyways, even if we somehow solved #1 we'd still have a lot of
code (even bluetooth) using it for the sake of issue #2.

So for the time being there is no clear path for trying to
eliminate frag lists altogether if we wanted to do that either.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Sept. 20, 2010, 7:21 a.m. UTC | #14
On Sun, Sep 19, 2010 at 05:17:25PM -0700, David Miller wrote:
> From: Jarek Poplawski <jarkao2@gmail.com>
> Date: Sat, 11 Sep 2010 14:31:40 +0200
> 
> > Otherwise seems OK, but I still would like to know the scenario
> > demanding this change.
> 
> Ok Jarek, after some more consideration I agree with you.
> Removing this kind of sharing would be unwise, ho hum...
> 
> While pondering over this I thought about why we even need
> frag lists at all.  We need them for two reasons:
> 
> 1) Because we have an inability to turn kmalloc data references into
>    page based ones easily.
> 
>    We've run into this sort of problem with socket splice() too.
> 
> 2) For recording the segmentation points of a fragmented packet.
> 
> We definitely don't use frag lists for performance, if you look
> in the GRO history the GRO code specifically has been changed
> to prefer accumulating into the page vector whenever possible
> because using frag lists is a lot slower.

Yes, and GRO made frag lists more popular btw.

> Anyways, even if we somehow solved #1 we'd still have a lot of
> code (even bluetooth) using it for the sake of issue #2.

Since there exist drivers using entirely paged skbs and napi_gro_frags
path I can't see why #1 or #2 could be unsolvable elsewhere.

> So for the time being there is no clear path for trying to
> eliminate frag lists altogether if we wanted to do that either.

Probably we could start from enhacing moving drivers to paged skbs
where possible. And maybe simplifying the skb model by not allowing
frags and frag lists together?

Btw, I wonder what is the exact reason we can't use only
NET_SKBUFF_DATA_USES_OFFSET?

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Sept. 20, 2010, 9:02 a.m. UTC | #15
Le lundi 20 septembre 2010 à 07:21 +0000, Jarek Poplawski a écrit :

> Probably we could start from enhacing moving drivers to paged skbs
> where possible. And maybe simplifying the skb model by not allowing
> frags and frag lists together?
> 

Sure. I believe current model, pre-allocating skb in huge tx rings is a
waste of mem bandwidth anyway. (I am refering to the struct sk_buff
itself, not the payload part)

Of course some drivers are doing it right, using netdev_alloc_skb()
right before feeding this skb to network stack, not an old one.

> Btw, I wonder what is the exact reason we can't use only
> NET_SKBUFF_DATA_USES_OFFSET?

I see no real reason.

On 32bit arches, it might be faster to manipulate pointers, and not
'base+offset' values.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Sept. 20, 2010, 9:14 a.m. UTC | #16
On Mon, Sep 20, 2010 at 11:02:00AM +0200, Eric Dumazet wrote:
> Le lundi 20 septembre 2010 ?? 07:21 +0000, Jarek Poplawski a écrit :
> 
> > Probably we could start from enhacing moving drivers to paged skbs
> > where possible. And maybe simplifying the skb model by not allowing
> > frags and frag lists together?
> > 
> 
> Sure. I believe current model, pre-allocating skb in huge tx rings is a
> waste of mem bandwidth anyway. (I am refering to the struct sk_buff
> itself, not the payload part)
> 
> Of course some drivers are doing it right, using netdev_alloc_skb()
> right before feeding this skb to network stack, not an old one.
> 
> > Btw, I wonder what is the exact reason we can't use only
> > NET_SKBUFF_DATA_USES_OFFSET?
> 
> I see no real reason.
> 
> On 32bit arches, it might be faster to manipulate pointers, and not
> 'base+offset' values.

The only reason is code readability and maintenance. Did anybody check
how much faster?

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jarek Poplawski Sept. 20, 2010, 12:12 p.m. UTC | #17
On Mon, Sep 20, 2010 at 09:14:25AM +0000, Jarek Poplawski wrote:
> On Mon, Sep 20, 2010 at 11:02:00AM +0200, Eric Dumazet wrote:
> > Le lundi 20 septembre 2010 ?? 07:21 +0000, Jarek Poplawski a écrit :
> > > Btw, I wonder what is the exact reason we can't use only
> > > NET_SKBUFF_DATA_USES_OFFSET?
> > 
> > I see no real reason.
> > 
> > On 32bit arches, it might be faster to manipulate pointers, and not
> > 'base+offset' values.
> 
> The only reason is code readability and maintenance. Did anybody check
> how much faster?

Hmm... I probably misread your reasoning. So, if no real reason,
really, how about turning this NET_SKBUFF_DATA_USES_OFFSET on
unconditionally in net-next, until somebody spots the difference?

Jarek P.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Sept. 20, 2010, 12:40 p.m. UTC | #18
Le lundi 20 septembre 2010 à 12:12 +0000, Jarek Poplawski a écrit :

> Hmm... I probably misread your reasoning. So, if no real reason,
> really, how about turning this NET_SKBUFF_DATA_USES_OFFSET on
> unconditionally in net-next, until somebody spots the difference?

Yes, but most developpers use 64bit kernels anyway, I suspect nobody
will ever notice :(

Here (with a typical config), here is the vmlinux size before and after
this patch :

$ size vmlinux.old
   text	   data	    bss	    dec	    hex	filename
6061748	 640208	7285056	13987012	 d56cc4	vmlinux.old

$ size vmlinux
   text	   data	    bss	    dec	    hex	filename
6070420	 640208	7285056	13995684	 d58ea4	vmlinux


Thats 8672 bytes of text increase.

(1330326 instructions instead of 1328472 -> 1854 more instructions)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 20, 2010, 4:59 p.m. UTC | #19
From: Jarek Poplawski <jarkao2@gmail.com>
Date: Mon, 20 Sep 2010 07:21:49 +0000

> On Sun, Sep 19, 2010 at 05:17:25PM -0700, David Miller wrote:
>> We definitely don't use frag lists for performance, if you look
>> in the GRO history the GRO code specifically has been changed
>> to prefer accumulating into the page vector whenever possible
>> because using frag lists is a lot slower.
> 
> Yes, and GRO made frag lists more popular btw.

Yes.  However, realize that this is only really a legacy from
inet_lro.

These drivers could restructure themselves to operate on pages.
I am pretty sure the hardware would be amicable to this and it
would likely improve performance just as favoring page vector
accumulation did for GRO.

> Btw, I wonder what is the exact reason we can't use only
> NET_SKBUFF_DATA_USES_OFFSET?

As Eric explained, and then demonstrated, it generates a non-trivial
increased amount of code.

This is almost certainly why Arnaldo didn't make it unconditional
back when he implemented the SKB data offset changes for 64-bit.

In my opinion, this duality of SKB pointer handling never causes
real problems because any change mistakenly accessing the pointers
directly usually gets caught by the first person who builds it on
64-bit :-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 752c197..aaa9750 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -327,6 +327,32 @@  static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
+static struct sk_buff *skb_copy_fraglist(struct sk_buff *parent,
+					 gfp_t gfp_mask)
+{
+	struct sk_buff *first_skb = NULL;
+	struct sk_buff *prev_skb = NULL;
+	struct sk_buff *skb;
+
+	skb_walk_frags(parent, skb) {
+		struct sk_buff *nskb = pskb_copy(skb, gfp_mask);
+
+		if (!nskb)
+			goto fail;
+		if (!first_skb)
+			first_skb = skb;
+		else
+			prev_skb->next = skb;
+		prev_skb = skb;
+	}
+
+	return first_skb;
+
+fail:
+	skb_drop_list(&first_skb);
+	return NULL;
+}
+
 static void skb_release_data(struct sk_buff *skb)
 {
 	if (!skb->cloned ||
@@ -775,11 +801,12 @@  EXPORT_SYMBOL(pskb_copy);
 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		     gfp_t gfp_mask)
 {
-	int i;
-	u8 *data;
 	int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail;
-	long off;
+	struct skb_shared_info *new_shinfo;
 	bool fastpath;
+	u8 *data;
+	long off;
+	int i;
 
 	BUG_ON(nhead < 0);
 
@@ -797,8 +824,8 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	 */
 	memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);
 
-	memcpy((struct skb_shared_info *)(data + size),
-	       skb_shinfo(skb),
+	new_shinfo = (struct skb_shared_info *)(data + size);
+	memcpy(new_shinfo, skb_shinfo(skb),
 	       offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));
 
 	/* Check if we can avoid taking references on fragments if we own
@@ -815,14 +842,20 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	if (fastpath) {
 		kfree(skb->head);
 	} else {
+		if (skb_has_frag_list(skb)) {
+			struct sk_buff *new_list;
+
+			new_list = skb_copy_fraglist(skb, gfp_mask);
+			if (!new_list)
+				goto free_data;
+			new_shinfo->frag_list = new_list;
+		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 			get_page(skb_shinfo(skb)->frags[i].page);
 
-		if (skb_has_frag_list(skb))
-			skb_clone_fraglist(skb);
-
 		skb_release_data(skb);
 	}
+
 	off = (data + nhead) - skb->head;
 
 	skb->head     = data;
@@ -848,6 +881,8 @@  int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 	return 0;
 
+free_data:
+	kfree(data);
 nodata:
 	return -ENOMEM;
 }