Message ID | 20100910.125449.235704956.davem@davemloft.net |
---|---|
State | Rejected, archived |
Delegated to: | David Miller |
Headers | show |
On Fri, Sep 10, 2010 at 12:54:49PM -0700, David Miller wrote: > From: Eric Dumazet <eric.dumazet@gmail.com> > Date: Tue, 07 Sep 2010 11:37:28 +0200 > > > Le mardi 07 septembre 2010 ? 09:16 +0000, Jarek Poplawski a écrit : > >> On 2010-09-07 07:02, Eric Dumazet wrote: > > > >> > > >> > I understand what you want to do, but problem is we need to perform a > >> > CAS2 operation : atomically changes two values (dataref and frag_list) > >> > >> Alas I can't understand why do you think these clone and atomic tests > >> in skb_release_data() don't protect skb_shinfo(skb)->frag_list enough. > >> > > > > It was early in the morning, before a cup of tea. > > > > David only had to set frag_list in the new shinfo, not the old. > > Ok, how does this look? > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 752c197..aaa9750 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -327,6 +327,32 @@ static void skb_clone_fraglist(struct sk_buff *skb) > skb_get(list); > } > > +static struct sk_buff *skb_copy_fraglist(struct sk_buff *parent, > + gfp_t gfp_mask) > +{ > + struct sk_buff *first_skb = NULL; > + struct sk_buff *prev_skb = NULL; > + struct sk_buff *skb; > + > + skb_walk_frags(parent, skb) { > + struct sk_buff *nskb = pskb_copy(skb, gfp_mask); > + > + if (!nskb) > + goto fail; > + if (!first_skb) > + first_skb = skb; Probably here and below: "= nskb" > + else > + prev_skb->next = skb; > + prev_skb = skb; > + } > + > + return first_skb; > + > +fail: With "if (first_skb)" here it would look better to me even if it currently doesn't matter. Otherwise seems OK, but I still would like to know the scenario demanding this change. Jarek P. 
> + skb_drop_list(&first_skb); > + return NULL; > +} > + > static void skb_release_data(struct sk_buff *skb) > { > if (!skb->cloned || > @@ -775,11 +801,12 @@ EXPORT_SYMBOL(pskb_copy); > int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, > gfp_t gfp_mask) > { > - int i; > - u8 *data; > int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; > - long off; > + struct skb_shared_info *new_shinfo; > bool fastpath; > + u8 *data; > + long off; > + int i; > > BUG_ON(nhead < 0); > > @@ -797,8 +824,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, > */ > memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); > > - memcpy((struct skb_shared_info *)(data + size), > - skb_shinfo(skb), > + new_shinfo = (struct skb_shared_info *)(data + size); > + memcpy(new_shinfo, skb_shinfo(skb), > offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); > > /* Check if we can avoid taking references on fragments if we own > @@ -815,14 +842,20 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, > if (fastpath) { > kfree(skb->head); > } else { > + if (skb_has_frag_list(skb)) { > + struct sk_buff *new_list; > + > + new_list = skb_copy_fraglist(skb, gfp_mask); > + if (!new_list) > + goto free_data; > + new_shinfo->frag_list = new_list; > + } > for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) > get_page(skb_shinfo(skb)->frags[i].page); > > - if (skb_has_frag_list(skb)) > - skb_clone_fraglist(skb); > - > skb_release_data(skb); > } > + > off = (data + nhead) - skb->head; > > skb->head = data; > @@ -848,6 +881,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, > atomic_set(&skb_shinfo(skb)->dataref, 1); > return 0; > > +free_data: > + kfree(data); > nodata: > return -ENOMEM; > } -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Jarek Poplawski <jarkao2@gmail.com> Date: Sat, 11 Sep 2010 14:31:40 +0200 > Otherwise seems OK, but I still would like to know the scenario > demanding this change. I want to make sk_buff use list_head, including all uses such as frag_list et al. If the frag_list chain can be shared, a doubly linked list cannot be used. This is something I've been gradually working on now for more than 2 years :) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sat, Sep 11, 2010 at 08:30:02PM -0700, David Miller wrote: > From: Jarek Poplawski <jarkao2@gmail.com> > Date: Sat, 11 Sep 2010 14:31:40 +0200 > > > Otherwise seems OK, but I still would like to know the scenario > > demanding this change. > > I want to make sk_buff use list_head, including all uses such as > frag_list et al. > > If the frag_list chain can be shared, a doubly linked list cannot be > used. > > This is someting I've been gradually working on now for more than 2 > years :) Hmm... Then the first message/changelog in this thread seems to describe the future bug, only with doubly linked lists. If so, it was a bit misleading to me ;-) Then a few more questions: 1) if doubly linked lists really require such pskb_copying, isn't it all too costly? 2) why skb_clone isn't enough instead of pskb_copy? 3) since skb_clone has some cost too, why e.g. saving only the pointer to the tail of the list in skb_shared_info isn't enough? Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Sep 12, 2010 at 12:45:34PM +0200, Jarek Poplawski wrote: > On Sat, Sep 11, 2010 at 08:30:02PM -0700, David Miller wrote: > > From: Jarek Poplawski <jarkao2@gmail.com> > > Date: Sat, 11 Sep 2010 14:31:40 +0200 > > > > > Otherwise seems OK, but I still would like to know the scenario > > > demanding this change. > > > > I want to make sk_buff use list_head, including all uses such as > > frag_list et al. > > > > If the frag_list chain can be shared, a doubly linked list cannot be > > used. > > > > This is someting I've been gradually working on now for more than 2 > > years :) > > Hmm... Then the first message/changelog in this thread seems to > describe the future bug, only with doubly linked lists. If so, it was > a bit misleading to me ;-) > > Then a few more questions: > 1) if doubly linked lists really require such pskb_copying, isn't it > all too costly? > 2) why skb_clone isn't enough instead of pskb_copy? > 3) since skb_clone has some cost too, why e.g. saving only the pointer > to the tail of the list in skb_shared_info isn't enough? ...3a) IOW, do we really need this double linking... Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Jarek Poplawski <jarkao2@gmail.com> Date: Sun, 12 Sep 2010 12:45:34 +0200 > Then a few more questions: > 1) if doubly linked lists really require such pskb_copying, isn't it > all too costly? In the common case the data reference will be one, so we will not copy. > 2) why skb_clone isn't enough instead of pskb_copy? Can't share the metadata. > 3) since skb_clone has some cost too, why e.g. saving only the pointer > to the tail of the list in skb_shared_info isn't enough? Then we won't get the rest of the advantages of using list_head such as prefetching during traversals, automatic debugging facilities, et al. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
BTW, Jarek, as to your idea to store a tail pointer in the shinfo, how will you sync that tail pointer in all of the shinfo instances referencing the frag list? It simply can't work, we have to copy. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
David Miller <davem@davemloft.net> writes: >> 3) since skb_clone has some cost too, why e.g. saving only the pointer >> to the tail of the list in skb_shared_info isn't enough? > > Then we won't get the rest of the advantages of using list_head such > as prefetching during traversals, automatic debugging facilities, et al. Did you see the recent patch from Andi Kleen where he proposes removing this prefetching in most situations because the costs outweigh the benefits on most modern architectures? http://permalink.gmane.org/gmane.linux.kernel/1033281 I'm not saying that list_head doesn't have other advantages though.
From: Ben Pfaff <blp@cs.stanford.edu> Date: Sun, 12 Sep 2010 12:55:54 -0700 > David Miller <davem@davemloft.net> writes: > >>> 3) since skb_clone has some cost too, why e.g. saving only the pointer >>> to the tail of the list in skb_shared_info isn't enough? >> >> Then we won't get the rest of the advantages of using list_head such >> as prefetching during traversals, automatic debugging facilities, et al. > > Did you see the recent patch from Andi Kleen where he proposes > removing this prefetching in most situations because the costs > outweigh the benefits on most modern architectures? > http://permalink.gmane.org/gmane.linux.kernel/1033281 > > I'm not saying that list_head doesn't have other advantages > though. Yes I saw it and I somewhat disagree with him, but don't care enough to argue with him about it. There are much more important things to apply my mind and time to at the moment :-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Sep 12, 2010 at 08:58:33AM -0700, David Miller wrote: > From: Jarek Poplawski <jarkao2@gmail.com> > Date: Sun, 12 Sep 2010 12:45:34 +0200 > > > Then a few more questions: > > 1) if doubly linked lists really require such pskb_copying, isn't it > > all too costly? > > In the common case the data reference will be one, so we will not > copy. Even if so, one such a case on the fast path should hit performance, so it would need special reviewing. > > > 2) why skb_clone isn't enough instead of pskb_copy? > > Can't share the metadata. I'd really like to understand why the change in handling next/prev should affect more than skb pointers wrt. current sharing. > > > 3) since skb_clone has some cost too, why e.g. saving only the pointer > > to the tail of the list in skb_shared_info isn't enough? > > Then we won't get the rest of the advantages of using list_head such > as prefetching during traversals, automatic debugging facilities, et al. Right, we need to sum pros and cons. So, what's the pros? ;-) Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Sep 12, 2010 at 09:13:53AM -0700, David Miller wrote: > > BTW, Jarek, as to your idea to store a tail pointer in the shinfo, how > will you sync that tail pointer in all of the shinfo instances > referencing the frag list? > > It simply can't work, we have to copy. The question is if we need to sync at all? This is shared data at the moment, so I can't imagine how the list (especialy doubly linked) could be changed without locking? And even if it's possible, I doubt copying e.g. like in your current patch can help when an skb is added at the tail later. Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Jarek Poplawski <jarkao2@gmail.com> Date: Sun, 12 Sep 2010 22:57:22 +0200 > On Sun, Sep 12, 2010 at 09:13:53AM -0700, David Miller wrote: >> >> BTW, Jarek, as to your idea to store a tail pointer in the shinfo, how >> will you sync that tail pointer in all of the shinfo instances >> referencing the frag list? >> >> It simply can't work, we have to copy. > > The question is if we need to sync at all? This is shared data at the > moment, so I can't imagine how the list (especialy doubly linked) > could be changed without locking? And even if it's possible, I doubt > copying e.g. like in your current patch can help when an skb is added > at the tail later. That's the fundamental issue. If you look, everywhere we curently do that trick of "use the skb->prev pointer to remmeber the frag_list tail" the code knows it has exclusive access to both the skb metadata and the underlying data. But for modifications of the frag list during the SKBs lifetime that's another issue, entirely. All of these functions trimming the head or tail of the SKB data which can modify the frag list elements, they can be called from all kinds of contexts. Look for Alexey Kuznetsov's comments in skbuff.c that read "mincing fragments" and similar. The real win with my work is complete unification of all list handling, and making our packet handling code much more "hackable" by non-networking kernel hackers. Really we have the last major core datastructures that do not use standard lists, and I'm going to convert it so we can be sane like the rest of the kernel. :-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2010-09-13 00:08, David Miller wrote: > From: Jarek Poplawski <jarkao2@gmail.com> > Date: Sun, 12 Sep 2010 22:57:22 +0200 > >> On Sun, Sep 12, 2010 at 09:13:53AM -0700, David Miller wrote: >>> >>> BTW, Jarek, as to your idea to store a tail pointer in the shinfo, how >>> will you sync that tail pointer in all of the shinfo instances >>> referencing the frag list? >>> >>> It simply can't work, we have to copy. >> >> The question is if we need to sync at all? This is shared data at the >> moment, so I can't imagine how the list (especialy doubly linked) >> could be changed without locking? And even if it's possible, I doubt >> copying e.g. like in your current patch can help when an skb is added >> at the tail later. > > That's the fundamental issue. > > If you look, everywhere we curently do that trick of "use the > skb->prev pointer to remmeber the frag_list tail" the code knows > it has exclusive access to both the skb metadata and the > underlying data. > > But for modifications of the frag list during the SKBs lifetime > that's another issue, entirely. All of these functions trimming > the head or tail of the SKB data which can modify the frag > list elements, they can be called from all kinds of contexts. > Look for Alexey Kuznetsov's comments in skbuff.c that read > "mincing fragments" and similar. OK, I've found the skb_cow_data() comments, but I can see there only a modification of copied data. > > The real win with my work is complete unification of all list > handling, and making our packet handling code much more "hackable" > by non-networking kernel hackers. > > Really we have the last major core datastructures that do not > use standard lists, and I'm going to convert it so we can > be sane like the rest of the kernel. :-) I guess we can stay with different opinions, no problem, at least to me. IMHO, considering the need for additional copying or even only cloning data where it's not currently done, doubly linked list is too costly for the frag_list. 
It would affect pskb_expand_head(), pskb_copy() but probably also skb_segment(), and maybe more. So, with GROwing(!) importance of fragmented packets, I doubt this copying will be as unlikely as presumed. Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Jarek Poplawski <jarkao2@gmail.com> Date: Sat, 11 Sep 2010 14:31:40 +0200 > Otherwise seems OK, but I still would like to know the scenario > demanding this change. Ok Jarek, after some more consideration I agree with you. Removing this kind of sharing would be unwise, ho hum... While pondering over this I thought about why we even need frag lists at all. We need them for two reasons: 1) Because we have an inability to turn kmalloc data references into page based ones easily. We've run into this sort of problem with socket splice() too. 2) For recording the segmentation points of a fragmented packet. We definitely don't use frag lists for performance, if you look in the GRO history the GRO code specifically has been changed to prefer accumulating into the page vector whenever possible because using frag lists is a lot slower. Anyways, even if we somehow solved #1 we'd still have a lot of code (even bluetooth) using it for the sake of issue #2. So for the time being there is no clear path for trying to eliminate frag lists altogether if we wanted to do that either. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Sep 19, 2010 at 05:17:25PM -0700, David Miller wrote: > From: Jarek Poplawski <jarkao2@gmail.com> > Date: Sat, 11 Sep 2010 14:31:40 +0200 > > > Otherwise seems OK, but I still would like to know the scenario > > demanding this change. > > Ok Jarek, after some more consideration I agree with you. > Removing this kind of sharing would be unwise, ho hum... > > While pondering over this I thought about why we even need > frag lists at all. We need them for two reasons: > > 1) Because we have an inability to turn kmalloc data references into > page based ones easily. > > We've run into this sort of problem with socket splice() too. > > 2) For recording the segmentation points of a fragmented packet. > > We definitely don't use frag lists for performance, if you look > in the GRO history the GRO code specifically has been changed > to prefer accumulating into the page vector whenever possible > because using frag lists is a lot slower. Yes, and GRO made frag lists more popular btw. > Anyways, even if we somehow solved #1 we'd still have a lot of > code (even bluetooth) using it for the sake of issue #2. Since there exist drivers using entirely paged skbs and napi_gro_frags path I can't see why #1 or #2 could be unsolvable elsewhere. > So for the time being there is no clear path for trying to > eliminate frag lists altogether if we wanted to do that either. Probably we could start from enhacing moving drivers to paged skbs where possible. And maybe simplifying the skb model by not allowing frags and frag lists together? Btw, I wonder what is the exact reason we can't use only NET_SKBUFF_DATA_USES_OFFSET? Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le lundi 20 septembre 2010 à 07:21 +0000, Jarek Poplawski a écrit : > Probably we could start from enhancing moving drivers to paged skbs > where possible. And maybe simplifying the skb model by not allowing > frags and frag lists together? > Sure. I believe current model, pre-allocating skb in huge tx rings is a waste of mem bandwidth anyway. (I am referring to the struct sk_buff itself, not the payload part) Of course some drivers are doing it right, using netdev_alloc_skb() right before feeding this skb to network stack, not an old one. > Btw, I wonder what is the exact reason we can't use only > NET_SKBUFF_DATA_USES_OFFSET? I see no real reason. On 32bit arches, it might be faster to manipulate pointers, and not 'base+offset' values. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Sep 20, 2010 at 11:02:00AM +0200, Eric Dumazet wrote: > Le lundi 20 septembre 2010 à 07:21 +0000, Jarek Poplawski a écrit : > > > Probably we could start from enhancing moving drivers to paged skbs > > where possible. And maybe simplifying the skb model by not allowing > > frags and frag lists together? > > > > Sure. I believe current model, pre-allocating skb in huge tx rings is a > waste of mem bandwidth anyway. (I am referring to the struct sk_buff > itself, not the payload part) > > Of course some drivers are doing it right, using netdev_alloc_skb() > right before feeding this skb to network stack, not an old one. > > > Btw, I wonder what is the exact reason we can't use only > > NET_SKBUFF_DATA_USES_OFFSET? > > I see no real reason. > > On 32bit arches, it might be faster to manipulate pointers, and not > 'base+offset' values. The only reason is code readability and maintenance. Did anybody check how much faster? Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Sep 20, 2010 at 09:14:25AM +0000, Jarek Poplawski wrote: > On Mon, Sep 20, 2010 at 11:02:00AM +0200, Eric Dumazet wrote: > > Le lundi 20 septembre 2010 à 07:21 +0000, Jarek Poplawski a écrit : > > > Btw, I wonder what is the exact reason we can't use only > > > NET_SKBUFF_DATA_USES_OFFSET? > > > > I see no real reason. > > > > On 32bit arches, it might be faster to manipulate pointers, and not > > 'base+offset' values. > > The only reason is code readability and maintenance. Did anybody check > how much faster? Hmm... I probably misread your reasoning. So, if no real reason, really, how about turning this NET_SKBUFF_DATA_USES_OFFSET on unconditionally in net-next, until somebody spots the difference? Jarek P. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Le lundi 20 septembre 2010 à 12:12 +0000, Jarek Poplawski a écrit : > Hmm... I probably misread your reasoning. So, if no real reason, > really, how about turning this NET_SKBUFF_DATA_USES_OFFSET on > unconditionally in net-next, until somebody spots the difference? Yes, but most developers use 64bit kernels anyway, I suspect nobody will ever notice :( Here (with a typical config), here is the vmlinux size before and after this patch : $ size vmlinux.old text data bss dec hex filename 6061748 640208 7285056 13987012 d56cc4 vmlinux.old $ size vmlinux text data bss dec hex filename 6070420 640208 7285056 13995684 d58ea4 vmlinux That's 8672 bytes of text increase. (1330326 instructions instead of 1328472 -> 1854 more instructions) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
From: Jarek Poplawski <jarkao2@gmail.com> Date: Mon, 20 Sep 2010 07:21:49 +0000 > On Sun, Sep 19, 2010 at 05:17:25PM -0700, David Miller wrote: >> We definitely don't use frag lists for performance, if you look >> in the GRO history the GRO code specifically has been changed >> to prefer accumulating into the page vector whenever possible >> because using frag lists is a lot slower. > > Yes, and GRO made frag lists more popular btw. Yes. However, realize that this is only really a legacy from inet_lro. These drivers could restructure themselves to operate on pages. I am pretty sure the hardware would be amicable to this and it would likely improve performance just as favoring page vector accumulation did for GRO. > Btw, I wonder what is the exact reason we can't use only > NET_SKBUFF_DATA_USES_OFFSET? As Eric explained, and then demonstrated, it generates a non-trivial increased amount of code. This is almost certainly why Arnaldo didn't make it unconditional back when he implemented the SKB data offset changes for 64-bit. In my opinion, this duality of SKB pointer handling never causes real problems because any change mistakenly accessing the pointers directly usually gets caught by the first person who builds it on 64-bit :-) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 752c197..aaa9750 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -327,6 +327,32 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +static struct sk_buff *skb_copy_fraglist(struct sk_buff *parent, + gfp_t gfp_mask) +{ + struct sk_buff *first_skb = NULL; + struct sk_buff *prev_skb = NULL; + struct sk_buff *skb; + + skb_walk_frags(parent, skb) { + struct sk_buff *nskb = pskb_copy(skb, gfp_mask); + + if (!nskb) + goto fail; + if (!first_skb) + first_skb = skb; + else + prev_skb->next = skb; + prev_skb = skb; + } + + return first_skb; + +fail: + skb_drop_list(&first_skb); + return NULL; +} + static void skb_release_data(struct sk_buff *skb) { if (!skb->cloned || @@ -775,11 +801,12 @@ EXPORT_SYMBOL(pskb_copy); int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { - int i; - u8 *data; int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; - long off; + struct skb_shared_info *new_shinfo; bool fastpath; + u8 *data; + long off; + int i; BUG_ON(nhead < 0); @@ -797,8 +824,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, */ memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); - memcpy((struct skb_shared_info *)(data + size), - skb_shinfo(skb), + new_shinfo = (struct skb_shared_info *)(data + size); + memcpy(new_shinfo, skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); /* Check if we can avoid taking references on fragments if we own @@ -815,14 +842,20 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (fastpath) { kfree(skb->head); } else { + if (skb_has_frag_list(skb)) { + struct sk_buff *new_list; + + new_list = skb_copy_fraglist(skb, gfp_mask); + if (!new_list) + goto free_data; + new_shinfo->frag_list = new_list; + } for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) get_page(skb_shinfo(skb)->frags[i].page); - if (skb_has_frag_list(skb)) - skb_clone_fraglist(skb); 
- skb_release_data(skb); } + off = (data + nhead) - skb->head; skb->head = data; @@ -848,6 +881,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, atomic_set(&skb_shinfo(skb)->dataref, 1); return 0; +free_data: + kfree(data); nodata: return -ENOMEM; }