Message ID | 1400585062-13580-1-git-send-email-igorr@swortex.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Tue, May 20, 2014 at 02:24:21PM +0300, Igor Royzis wrote: > Fix accessing GSO fragments memory (and a possible corruption therefore) after > reporting completion in a zero copy callback. The previous fix in the commit 1fd819ec > orphaned frags which eliminates zero copy advantages. The fix makes the completion > called after all the fragments were processed avoiding unnecessary orphaning/copying > from userspace. > > The GSO fragments corruption issue was observed in a typical QEMU/KVM VM setup that > hosts a Windows guest (since QEMU virtio-net Windows driver doesn't support GRO). > The fix has been verified by running the HCK OffloadLSO test. > > Signed-off-by: Igor Royzis <igorr@swortex.com> > Signed-off-by: Anton Nayshtut <anton@swortex.com> OK but with 1fd819ec there's no corruption, correct? So this patch is in fact an optimization? If true, I'd like to see some performance numbers please. Thanks! > --- > include/linux/skbuff.h | 1 + > net/core/skbuff.c | 18 +++++++++++++----- > 2 files changed, 14 insertions(+), 5 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 08074a8..8c49edc 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -287,6 +287,7 @@ struct skb_shared_info { > struct sk_buff *frag_list; > struct skb_shared_hwtstamps hwtstamps; > __be32 ip6_frag_id; > + struct sk_buff *zcopy_src; > > /* > * Warning : all fields before dataref are cleared in __alloc_skb() > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 1b62343..6fa6342 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -610,14 +610,18 @@ EXPORT_SYMBOL(__kfree_skb); > */ > void kfree_skb(struct sk_buff *skb) > { > + struct sk_buff *zcopy_src; > if (unlikely(!skb)) > return; > if (likely(atomic_read(&skb->users) == 1)) > smp_rmb(); > else if (likely(!atomic_dec_and_test(&skb->users))) > return; > + zcopy_src = skb_shinfo(skb)->zcopy_src; > trace_kfree_skb(skb, __builtin_return_address(0)); > __kfree_skb(skb); > + 
if (unlikely(zcopy_src)) > + kfree_skb(zcopy_src); > } > EXPORT_SYMBOL(kfree_skb); > > @@ -662,14 +666,18 @@ EXPORT_SYMBOL(skb_tx_error); > */ > void consume_skb(struct sk_buff *skb) > { > + struct sk_buff *zcopy_src; > if (unlikely(!skb)) > return; > if (likely(atomic_read(&skb->users) == 1)) > smp_rmb(); > else if (likely(!atomic_dec_and_test(&skb->users))) > return; > + zcopy_src = skb_shinfo(skb)->zcopy_src; > trace_consume_skb(skb); > __kfree_skb(skb); > + if (unlikely(zcopy_src)) > + consume_skb(zcopy_src); > } > EXPORT_SYMBOL(consume_skb); > > @@ -2867,7 +2875,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > skb_frag_t *frag = skb_shinfo(head_skb)->frags; > unsigned int mss = skb_shinfo(head_skb)->gso_size; > unsigned int doffset = head_skb->data - skb_mac_header(head_skb); > - struct sk_buff *frag_skb = head_skb; > unsigned int offset = doffset; > unsigned int tnl_hlen = skb_tnl_header_len(head_skb); > unsigned int headroom; > @@ -2913,7 +2920,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > i = 0; > nfrags = skb_shinfo(list_skb)->nr_frags; > frag = skb_shinfo(list_skb)->frags; > - frag_skb = list_skb; > pos += skb_headlen(list_skb); > > while (pos < offset + len) { > @@ -2975,6 +2981,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > nskb->data - tnl_hlen, > doffset + tnl_hlen); > > + if (skb_shinfo(head_skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { > + skb_shinfo(nskb)->zcopy_src = head_skb; > + atomic_inc(&head_skb->users); > + } > + > if (nskb->len == len + doffset) > goto perform_csum_check; > > @@ -3001,7 +3012,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > i = 0; > nfrags = skb_shinfo(list_skb)->nr_frags; > frag = skb_shinfo(list_skb)->frags; > - frag_skb = list_skb; > > BUG_ON(!nfrags); > > @@ -3016,8 +3026,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > goto err; > } > > - if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) > - goto err; > > *nskb_frag = *frag; > __skb_frag_ref(nskb_frag); > 
-- > 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Michael, You're absolutely right. We detected the actual corruption while running MS HCK on earlier kernels, before commit 1fd819ec, so the patch was developed as a fix for this issue. However, 1fd819ec fixes the corruption and now it's only an optimization that re-enables the zero copy for this case. We're collecting the numbers right now and will post them as soon as possible. Best Regards, Anton On 5/20/2014 2:50 PM, Michael S. Tsirkin wrote: > On Tue, May 20, 2014 at 02:24:21PM +0300, Igor Royzis wrote: >> Fix accessing GSO fragments memory (and a possible corruption therefore) after >> reporting completion in a zero copy callback. The previous fix in the commit 1fd819ec >> orphaned frags which eliminates zero copy advantages. The fix makes the completion >> called after all the fragments were processed avoiding unnecessary orphaning/copying >> from userspace. >> >> The GSO fragments corruption issue was observed in a typical QEMU/KVM VM setup that >> hosts a Windows guest (since QEMU virtio-net Windows driver doesn't support GRO). >> The fix has been verified by running the HCK OffloadLSO test. >> >> Signed-off-by: Igor Royzis <igorr@swortex.com> >> Signed-off-by: Anton Nayshtut <anton@swortex.com> > > OK but with 1fd819ec there's no corruption, correct? > So this patch is in fact an optimization? > If true, I'd like to see some performance numbers please. > > Thanks! 
> >> --- >> include/linux/skbuff.h | 1 + >> net/core/skbuff.c | 18 +++++++++++++----- >> 2 files changed, 14 insertions(+), 5 deletions(-) >> >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >> index 08074a8..8c49edc 100644 >> --- a/include/linux/skbuff.h >> +++ b/include/linux/skbuff.h >> @@ -287,6 +287,7 @@ struct skb_shared_info { >> struct sk_buff *frag_list; >> struct skb_shared_hwtstamps hwtstamps; >> __be32 ip6_frag_id; >> + struct sk_buff *zcopy_src; >> >> /* >> * Warning : all fields before dataref are cleared in __alloc_skb() >> diff --git a/net/core/skbuff.c b/net/core/skbuff.c >> index 1b62343..6fa6342 100644 >> --- a/net/core/skbuff.c >> +++ b/net/core/skbuff.c >> @@ -610,14 +610,18 @@ EXPORT_SYMBOL(__kfree_skb); >> */ >> void kfree_skb(struct sk_buff *skb) >> { >> + struct sk_buff *zcopy_src; >> if (unlikely(!skb)) >> return; >> if (likely(atomic_read(&skb->users) == 1)) >> smp_rmb(); >> else if (likely(!atomic_dec_and_test(&skb->users))) >> return; >> + zcopy_src = skb_shinfo(skb)->zcopy_src; >> trace_kfree_skb(skb, __builtin_return_address(0)); >> __kfree_skb(skb); >> + if (unlikely(zcopy_src)) >> + kfree_skb(zcopy_src); >> } >> EXPORT_SYMBOL(kfree_skb); >> >> @@ -662,14 +666,18 @@ EXPORT_SYMBOL(skb_tx_error); >> */ >> void consume_skb(struct sk_buff *skb) >> { >> + struct sk_buff *zcopy_src; >> if (unlikely(!skb)) >> return; >> if (likely(atomic_read(&skb->users) == 1)) >> smp_rmb(); >> else if (likely(!atomic_dec_and_test(&skb->users))) >> return; >> + zcopy_src = skb_shinfo(skb)->zcopy_src; >> trace_consume_skb(skb); >> __kfree_skb(skb); >> + if (unlikely(zcopy_src)) >> + consume_skb(zcopy_src); >> } >> EXPORT_SYMBOL(consume_skb); >> >> @@ -2867,7 +2875,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, >> skb_frag_t *frag = skb_shinfo(head_skb)->frags; >> unsigned int mss = skb_shinfo(head_skb)->gso_size; >> unsigned int doffset = head_skb->data - skb_mac_header(head_skb); >> - struct sk_buff *frag_skb = head_skb; >> 
unsigned int offset = doffset; >> unsigned int tnl_hlen = skb_tnl_header_len(head_skb); >> unsigned int headroom; >> @@ -2913,7 +2920,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, >> i = 0; >> nfrags = skb_shinfo(list_skb)->nr_frags; >> frag = skb_shinfo(list_skb)->frags; >> - frag_skb = list_skb; >> pos += skb_headlen(list_skb); >> >> while (pos < offset + len) { >> @@ -2975,6 +2981,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, >> nskb->data - tnl_hlen, >> doffset + tnl_hlen); >> >> + if (skb_shinfo(head_skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { >> + skb_shinfo(nskb)->zcopy_src = head_skb; >> + atomic_inc(&head_skb->users); >> + } >> + >> if (nskb->len == len + doffset) >> goto perform_csum_check; >> >> @@ -3001,7 +3012,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, >> i = 0; >> nfrags = skb_shinfo(list_skb)->nr_frags; >> frag = skb_shinfo(list_skb)->frags; >> - frag_skb = list_skb; >> >> BUG_ON(!nfrags); >> >> @@ -3016,8 +3026,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, >> goto err; >> } >> >> - if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) >> - goto err; >> >> *nskb_frag = *frag; >> __skb_frag_ref(nskb_frag); >> -- >> 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2014-05-20 at 14:24 +0300, Igor Royzis wrote: > Fix accessing GSO fragments memory (and a possible corruption therefore) after > reporting completion in a zero copy callback. The previous fix in the commit 1fd819ec > orphaned frags which eliminates zero copy advantages. The fix makes the completion > called after all the fragments were processed avoiding unnecessary orphaning/copying > from userspace. > > The GSO fragments corruption issue was observed in a typical QEMU/KVM VM setup that > hosts a Windows guest (since QEMU virtio-net Windows driver doesn't support GRO). > The fix has been verified by running the HCK OffloadLSO test. > > Signed-off-by: Igor Royzis <igorr@swortex.com> > Signed-off-by: Anton Nayshtut <anton@swortex.com> > --- > include/linux/skbuff.h | 1 + > net/core/skbuff.c | 18 +++++++++++++----- > 2 files changed, 14 insertions(+), 5 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 08074a8..8c49edc 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -287,6 +287,7 @@ struct skb_shared_info { > struct sk_buff *frag_list; > struct skb_shared_hwtstamps hwtstamps; > __be32 ip6_frag_id; > + struct sk_buff *zcopy_src; > Before your patch : sizeof(struct skb_shared_info)=0x140 offsetof(struct skb_shared_info, frags[1])=0x40 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) -> 0x140 After your patch : sizeof(struct skb_shared_info)=0x148 offsetof(struct skb_shared_info, frags[1])=0x48 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) -> 0x180 That's a serious bump, because it increases all skb truesizes, and a typical skb with one fragment will use 2 cache lines instead of one in struct skb_shared_info, so this adds memory pressure in the fast path. So while this patch might increase performance for some workloads, it generally decreases performance on many others. 
-- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2014-05-20 at 07:28 -0700, Eric Dumazet wrote: > On Tue, 2014-05-20 at 14:24 +0300, Igor Royzis wrote: > > Fix accessing GSO fragments memory (and a possible corruption therefore) after > > reporting completion in a zero copy callback. The previous fix in the commit 1fd819ec > > orphaned frags which eliminates zero copy advantages. The fix makes the completion > > called after all the fragments were processed avoiding unnecessary orphaning/copying > > from userspace. > > > > The GSO fragments corruption issue was observed in a typical QEMU/KVM VM setup that > > hosts a Windows guest (since QEMU virtio-net Windows driver doesn't support GRO). > > The fix has been verified by running the HCK OffloadLSO test. > > It looks like all segments (generated by GSO segmentation) should share original ubuf_info, and that it should be refcounted. A nightmare I suppose... (transferring the ubuf_info from original skb to last segment would be racy, as the last segment could be freed _before_ previous ones, in case a drop happens in qdisc layer, or packets are reordered by netem) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 20, 2014 at 09:05:38AM -0700, Eric Dumazet wrote: > On Tue, 2014-05-20 at 07:28 -0700, Eric Dumazet wrote: > > On Tue, 2014-05-20 at 14:24 +0300, Igor Royzis wrote: > > > Fix accessing GSO fragments memory (and a possible corruption therefore) after > > > reporting completion in a zero copy callback. The previous fix in the commit 1fd819ec > > > orphaned frags which eliminates zero copy advantages. The fix makes the completion > > > called after all the fragments were processed avoiding unnecessary orphaning/copying > > > from userspace. > > > > > > The GSO fragments corruption issue was observed in a typical QEMU/KVM VM setup that > > > hosts a Windows guest (since QEMU virtio-net Windows driver doesn't support GRO). > > > The fix has been verified by running the HCK OffloadLSO test. > > > > > It looks like all segments (generated by GSO segmentation) should share > original ubuf_info, and that it should be refcounted. > > A nightmare I suppose... That's what skb_frag_ref tried to do only for fragments, I guess. > (transferring the ubuf_info from original skb to last segment would be > racy, as the last segment could be freed _before_ previous ones, in case > a drop happens in qdisc layer, or packets are reordered by netem) > > -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> If true, I'd like to see some performance numbers please. The numbers have been obtained by running iperf between 2 QEMU Win2012 VMs, 4 vCPU/ 4GB RAM each. iperf parameters: -w 256K -l 256K -t 300 Original kernel 3.15.0-rc5: 34.4 Gbytes transferred, 984 Mbits/sec bandwidth. Kernel 3.15.0-rc5 with our patch: 42.5 Gbytes transferred, 1.22 Gbits/sec bandwidth. Overall improvement is about 24%. Below are raw iperf outputs. kernel 3.15.0-rc5: C:\iperf>iperf -c 192.168.11.2 -w 256K -l 256K -t 300 ------------------------------------------------------------ Client connecting to 192.168.11.2, TCP port 5001 TCP window size: 256 KByte ------------------------------------------------------------ [ 3] local 192.168.11.1 port 49167 connected with 192.168.11.2 port 5001 [ ID] Interval Transfer Bandwidth [ 3] 0.0-300.7 sec 34.4 GBytes 984 Mbits/sec kernel 3.15.0-rc5-patched: C:\iperf>iperf -c 192.168.11.2 -w 256K -l 256K -t 300 ------------------------------------------------------------ Client connecting to 192.168.11.2, TCP port 5001 TCP window size: 256 KByte ------------------------------------------------------------ [ 3] local 192.168.11.1 port 49167 connected with 192.168.11.2 port 5001 [ ID] Interval Transfer Bandwidth [ 3] 0.0-300.7 sec 42.5 GBytes 1.22 Gbits/sec On Tue, May 20, 2014 at 2:50 PM, Michael S. Tsirkin <mst@redhat.com> wrote: > > On Tue, May 20, 2014 at 02:24:21PM +0300, Igor Royzis wrote: > > Fix accessing GSO fragments memory (and a possible corruption therefore) after > > reporting completion in a zero copy callback. The previous fix in the commit 1fd819ec > > orphaned frags which eliminates zero copy advantages. The fix makes the completion > > called after all the fragments were processed avoiding unnecessary orphaning/copying > > from userspace. > > > > The GSO fragments corruption issue was observed in a typical QEMU/KVM VM setup that > > hosts a Windows guest (since QEMU virtio-net Windows driver doesn't support GRO). 
> > The fix has been verified by running the HCK OffloadLSO test. > > > > Signed-off-by: Igor Royzis <igorr@swortex.com> > > Signed-off-by: Anton Nayshtut <anton@swortex.com> > > OK but with 1fd819ec there's no corruption, correct? > So this patch is in fact an optimization? > If true, I'd like to see some performance numbers please. > > Thanks! > > > --- > > include/linux/skbuff.h | 1 + > > net/core/skbuff.c | 18 +++++++++++++----- > > 2 files changed, 14 insertions(+), 5 deletions(-) > > > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > > index 08074a8..8c49edc 100644 > > --- a/include/linux/skbuff.h > > +++ b/include/linux/skbuff.h > > @@ -287,6 +287,7 @@ struct skb_shared_info { > > struct sk_buff *frag_list; > > struct skb_shared_hwtstamps hwtstamps; > > __be32 ip6_frag_id; > > + struct sk_buff *zcopy_src; > > > > /* > > * Warning : all fields before dataref are cleared in __alloc_skb() > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > > index 1b62343..6fa6342 100644 > > --- a/net/core/skbuff.c > > +++ b/net/core/skbuff.c > > @@ -610,14 +610,18 @@ EXPORT_SYMBOL(__kfree_skb); > > */ > > void kfree_skb(struct sk_buff *skb) > > { > > + struct sk_buff *zcopy_src; > > if (unlikely(!skb)) > > return; > > if (likely(atomic_read(&skb->users) == 1)) > > smp_rmb(); > > else if (likely(!atomic_dec_and_test(&skb->users))) > > return; > > + zcopy_src = skb_shinfo(skb)->zcopy_src; > > trace_kfree_skb(skb, __builtin_return_address(0)); > > __kfree_skb(skb); > > + if (unlikely(zcopy_src)) > > + kfree_skb(zcopy_src); > > } > > EXPORT_SYMBOL(kfree_skb); > > > > @@ -662,14 +666,18 @@ EXPORT_SYMBOL(skb_tx_error); > > */ > > void consume_skb(struct sk_buff *skb) > > { > > + struct sk_buff *zcopy_src; > > if (unlikely(!skb)) > > return; > > if (likely(atomic_read(&skb->users) == 1)) > > smp_rmb(); > > else if (likely(!atomic_dec_and_test(&skb->users))) > > return; > > + zcopy_src = skb_shinfo(skb)->zcopy_src; > > trace_consume_skb(skb); > > __kfree_skb(skb); 
> > + if (unlikely(zcopy_src)) > > + consume_skb(zcopy_src); > > } > > EXPORT_SYMBOL(consume_skb); > > > > @@ -2867,7 +2875,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > > skb_frag_t *frag = skb_shinfo(head_skb)->frags; > > unsigned int mss = skb_shinfo(head_skb)->gso_size; > > unsigned int doffset = head_skb->data - skb_mac_header(head_skb); > > - struct sk_buff *frag_skb = head_skb; > > unsigned int offset = doffset; > > unsigned int tnl_hlen = skb_tnl_header_len(head_skb); > > unsigned int headroom; > > @@ -2913,7 +2920,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > > i = 0; > > nfrags = skb_shinfo(list_skb)->nr_frags; > > frag = skb_shinfo(list_skb)->frags; > > - frag_skb = list_skb; > > pos += skb_headlen(list_skb); > > > > while (pos < offset + len) { > > @@ -2975,6 +2981,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > > nskb->data - tnl_hlen, > > doffset + tnl_hlen); > > > > + if (skb_shinfo(head_skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { > > + skb_shinfo(nskb)->zcopy_src = head_skb; > > + atomic_inc(&head_skb->users); > > + } > > + > > if (nskb->len == len + doffset) > > goto perform_csum_check; > > > > @@ -3001,7 +3012,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > > i = 0; > > nfrags = skb_shinfo(list_skb)->nr_frags; > > frag = skb_shinfo(list_skb)->frags; > > - frag_skb = list_skb; > > > > BUG_ON(!nfrags); > > > > @@ -3016,8 +3026,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > > goto err; > > } > > > > - if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) > > - goto err; > > > > *nskb_frag = *frag; > > __skb_frag_ref(nskb_frag); > > -- > > 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, May 20, 2014 at 5:28 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > On Tue, 2014-05-20 at 14:24 +0300, Igor Royzis wrote: >> Fix accessing GSO fragments memory (and a possible corruption therefore) after >> reporting completion in a zero copy callback. The previous fix in the commit 1fd819ec >> orphaned frags which eliminates zero copy advantages. The fix makes the completion >> called after all the fragments were processed avoiding unnecessary orphaning/copying >> from userspace. >> >> The GSO fragments corruption issue was observed in a typical QEMU/KVM VM setup that >> hosts a Windows guest (since QEMU virtio-net Windows driver doesn't support GRO). >> The fix has been verified by running the HCK OffloadLSO test. >> >> Signed-off-by: Igor Royzis <igorr@swortex.com> >> Signed-off-by: Anton Nayshtut <anton@swortex.com> >> --- >> include/linux/skbuff.h | 1 + >> net/core/skbuff.c | 18 +++++++++++++----- >> 2 files changed, 14 insertions(+), 5 deletions(-) >> >> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >> index 08074a8..8c49edc 100644 >> --- a/include/linux/skbuff.h >> +++ b/include/linux/skbuff.h >> @@ -287,6 +287,7 @@ struct skb_shared_info { >> struct sk_buff *frag_list; >> struct skb_shared_hwtstamps hwtstamps; >> __be32 ip6_frag_id; >> + struct sk_buff *zcopy_src; >> > > Before your patch : > > sizeof(struct skb_shared_info)=0x140 > offsetof(struct skb_shared_info, frags[1])=0x40 > > SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) -> 0x140 > > After your patch : > > sizeof(struct skb_shared_info)=0x148 > offsetof(struct skb_shared_info, frags[1])=0x48 > > SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) -> 0x180 > > Thats a serious bump, because it increases all skb truesizes, and > typical skb with one fragment will use 2 cache lines instead of one in > struct skb_shared_info, so this adds memory pressure in fast path. 
> > So while this patch might increase performance for some workloads, > it generally decreases performance on many others. Would it "ease" the memory cache penalty if we moved the parent fragment pointer from skb_shared_info to skbuff itself? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 08074a8..8c49edc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -287,6 +287,7 @@ struct skb_shared_info { struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; __be32 ip6_frag_id; + struct sk_buff *zcopy_src; /* * Warning : all fields before dataref are cleared in __alloc_skb() diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1b62343..6fa6342 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -610,14 +610,18 @@ EXPORT_SYMBOL(__kfree_skb); */ void kfree_skb(struct sk_buff *skb) { + struct sk_buff *zcopy_src; if (unlikely(!skb)) return; if (likely(atomic_read(&skb->users) == 1)) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + zcopy_src = skb_shinfo(skb)->zcopy_src; trace_kfree_skb(skb, __builtin_return_address(0)); __kfree_skb(skb); + if (unlikely(zcopy_src)) + kfree_skb(zcopy_src); } EXPORT_SYMBOL(kfree_skb); @@ -662,14 +666,18 @@ EXPORT_SYMBOL(skb_tx_error); */ void consume_skb(struct sk_buff *skb) { + struct sk_buff *zcopy_src; if (unlikely(!skb)) return; if (likely(atomic_read(&skb->users) == 1)) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + zcopy_src = skb_shinfo(skb)->zcopy_src; trace_consume_skb(skb); __kfree_skb(skb); + if (unlikely(zcopy_src)) + consume_skb(zcopy_src); } EXPORT_SYMBOL(consume_skb); @@ -2867,7 +2875,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, skb_frag_t *frag = skb_shinfo(head_skb)->frags; unsigned int mss = skb_shinfo(head_skb)->gso_size; unsigned int doffset = head_skb->data - skb_mac_header(head_skb); - struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int headroom; @@ -2913,7 +2920,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; - frag_skb = list_skb; pos += skb_headlen(list_skb); while 
(pos < offset + len) { @@ -2975,6 +2981,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, nskb->data - tnl_hlen, doffset + tnl_hlen); + if (skb_shinfo(head_skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { + skb_shinfo(nskb)->zcopy_src = head_skb; + atomic_inc(&head_skb->users); + } + if (nskb->len == len + doffset) goto perform_csum_check; @@ -3001,7 +3012,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; - frag_skb = list_skb; BUG_ON(!nfrags); @@ -3016,8 +3026,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, goto err; } - if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) - goto err; *nskb_frag = *frag; __skb_frag_ref(nskb_frag);