Message ID | 20160823052030.GI3735@gauss.secunet.com |
---|---|
State | Changes Requested, archived |
Delegated to: | David Miller |
Headers | show |
On Mon, Aug 22, 2016 at 10:20 PM, Steffen Klassert <steffen.klassert@secunet.com> wrote: > Since commit 8a29111c7 ("net: gro: allow to build full sized skb") > gro may build buffers with a frag_list. This can hurt forwarding > because most NICs can't offload such packets, they need to be > segmented in software. This patch splits buffers with a frag_list > at the frag_list pointer into buffers that can be TSO offloaded. > > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> > --- > net/core/skbuff.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++- > net/ipv4/af_inet.c | 7 ++-- > net/ipv4/gre_offload.c | 7 +++- > net/ipv4/tcp_offload.c | 3 ++ > net/ipv4/udp_offload.c | 9 +++-- > net/ipv6/ip6_offload.c | 6 +++- > 6 files changed, 114 insertions(+), 7 deletions(-) > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 3864b4b6..a614e9d 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > sg = !!(features & NETIF_F_SG); > csum = !!can_checksum_protocol(features, proto); > > + headroom = skb_headroom(head_skb); > + > + if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) && > + csum && sg && (mss != GSO_BY_FRAGS) && > + !(features & NETIF_F_GSO_PARTIAL)) { Does this really need to be mutually exclusive with NETIF_F_GSO_PARTIAL and GSO_BY_FRAGS? This is occurring early enough that maybe instead of doubling the size of skb_segment you should look at instead adding a new static function that could handle splitting the frag_list and just call that instead of adding this massive amount of code. Some of these checks are more expensive than others. I would recommend doing the sg && csum && !(features & NETIF_F_GSO_PARTIAL) checks first. If possible you could even combine some of the checks since they are also in the block that sets up partial_segs. That way we can cut down on the total number of conditional branches needed. 
> + unsigned int lskb_segs; > + unsigned int delta_segs, delta_len, delta_truesize; > + struct sk_buff *nskb; > + delta_segs = delta_len = delta_truesize = 0; > + > + segs = __alloc_skb(skb_headlen(head_skb) + headroom, > + GFP_ATOMIC, skb_alloc_rx_flag(head_skb), > + NUMA_NO_NODE); > + if (unlikely(!segs)) > + return ERR_PTR(-ENOMEM); > + > + skb_reserve(segs, headroom); > + skb_put(segs, skb_headlen(head_skb)); > + skb_copy_from_linear_data(head_skb, segs->data, segs->len); > + copy_skb_header(segs, head_skb); > + > + if (skb_shinfo(head_skb)->nr_frags) { > + int i; > + > + if (skb_orphan_frags(head_skb, GFP_ATOMIC)) > + goto err; > + > + for (i = 0; i < skb_shinfo(head_skb)->nr_frags; i++) { > + skb_shinfo(segs)->frags[i] = skb_shinfo(head_skb)->frags[i]; > + skb_frag_ref(head_skb, i); > + } > + skb_shinfo(segs)->nr_frags = i; > + } > + > + do { > + nskb = skb_clone(list_skb, GFP_ATOMIC); > + if (unlikely(!nskb)) > + goto err; > + > + list_skb = list_skb->next; > + > + if (!tail) > + segs->next = nskb; > + else > + tail->next = nskb; > + > + tail = nskb; > + > + if (skb_cow_head(nskb, doffset + headroom)) > + goto err; > + > + lskb_segs = nskb->len / mss; > + > + skb_shinfo(nskb)->gso_size = mss; > + skb_shinfo(nskb)->gso_type = skb_shinfo(head_skb)->gso_type; > + skb_shinfo(nskb)->gso_segs = lskb_segs; > + > + > + delta_segs += lskb_segs; > + delta_len += nskb->len; > + delta_truesize += nskb->truesize; > + > + __skb_push(nskb, doffset); > + > + skb_release_head_state(nskb); > + __copy_skb_header(nskb, head_skb); > + > + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); > + skb_reset_mac_len(nskb); > + > + skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, > + nskb->data - tnl_hlen, > + doffset + tnl_hlen); > + > + } while (list_skb); > + > + skb_shinfo(segs)->gso_segs -= delta_segs; > + segs->len = head_skb->len - delta_len; > + segs->data_len = head_skb->data_len - delta_len; > + segs->truesize += head_skb->data_len - delta_truesize; > + > + 
segs->prev = tail; > + > + goto out; > + } > + > /* GSO partial only requires that we trim off any excess that > * doesn't fit into an MSS sized block, so take care of that > * now. > @@ -3090,7 +3176,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > partial_segs = 0; > } > > - headroom = skb_headroom(head_skb); > pos = skb_headlen(head_skb); > > do { > @@ -3307,6 +3392,8 @@ perform_csum_check: > swap(tail->destructor, head_skb->destructor); > swap(tail->sk, head_skb->sk); > } > + > +out: > return segs; > > err: > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index 55513e6..c814afa 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -1195,7 +1195,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); > struct sk_buff *inet_gso_segment(struct sk_buff *skb, > netdev_features_t features) > { > - bool udpfrag = false, fixedid = false, encap; > + bool udpfrag = false, fixedid = false, gso_partial = false, encap; > struct sk_buff *segs = ERR_PTR(-EINVAL); > const struct net_offload *ops; > unsigned int offset = 0; > @@ -1248,6 +1248,9 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, > if (IS_ERR_OR_NULL(segs)) > goto out; > > + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) > + gso_partial = true; > + For these kind of blocks it is usually best to just do: gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); The compiler usually does a better job of just doing a bit of arithmetic instead of generating a set of test/jump type instructions and generally that runs faster since there is less branching. The same applies to all the other cases where you setup gso_partial this way. 
> skb = segs; > do { > iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); > @@ -1257,7 +1260,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, > iph->frag_off |= htons(IP_MF); > offset += skb->len - nhoff - ihl; > tot_len = skb->len - nhoff; > - } else if (skb_is_gso(skb)) { > + } else if (skb_is_gso(skb) && gso_partial) { > if (!fixedid) { > iph->id = htons(id); > id += skb_shinfo(skb)->gso_segs; > diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c > index ecd1e09..cf82e28 100644 > --- a/net/ipv4/gre_offload.c > +++ b/net/ipv4/gre_offload.c > @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, > __be16 protocol = skb->protocol; > u16 mac_len = skb->mac_len; > int gre_offset, outer_hlen; > - bool need_csum, ufo; > + bool need_csum, ufo, gso_partial; > > if (!skb->encapsulation) > goto out; > @@ -69,6 +69,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, > goto out; > } > > + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) > + gso_partial = true; > + else > + gso_partial = false; > + > outer_hlen = skb_tnl_header_len(skb); > gre_offset = outer_hlen - tnl_hlen; > skb = segs; > diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c > index 5c59649..dddd227 100644 > --- a/net/ipv4/tcp_offload.c > +++ b/net/ipv4/tcp_offload.c > @@ -107,6 +107,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, > > /* Only first segment might have ooo_okay set */ > segs->ooo_okay = ooo_okay; > + if (skb_is_gso(segs) && !(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)) > + mss = (skb_tail_pointer(segs) - skb_transport_header(segs)) + > + segs->data_len - thlen; > > delta = htonl(oldlen + (thlen + mss)); > > diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c > index 81f253b..dfb6a2c 100644 > --- a/net/ipv4/udp_offload.c > +++ b/net/ipv4/udp_offload.c > @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, > __be16 new_protocol, bool is_ipv6) > { > int tnl_hlen = 
skb_inner_mac_header(skb) - skb_transport_header(skb); > - bool remcsum, need_csum, offload_csum, ufo; > + bool remcsum, need_csum, offload_csum, ufo, gso_partial; > struct sk_buff *segs = ERR_PTR(-EINVAL); > struct udphdr *uh = udp_hdr(skb); > u16 mac_offset = skb->mac_header; > @@ -88,6 +88,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, > goto out; > } > > + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) > + gso_partial = true; > + else > + gso_partial = false; > + > outer_hlen = skb_tnl_header_len(skb); > udp_offset = outer_hlen - tnl_hlen; > skb = segs; > @@ -117,7 +122,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, > * will be using a length value equal to only one MSS sized > * segment instead of the entire frame. > */ > - if (skb_is_gso(skb)) { > + if (skb_is_gso(skb) && gso_partial) { > uh->len = htons(skb_shinfo(skb)->gso_size + > SKB_GSO_CB(skb)->data_offset + > skb->head - (unsigned char *)uh); > diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c > index 22e90e5..0ec16ba 100644 > --- a/net/ipv6/ip6_offload.c > +++ b/net/ipv6/ip6_offload.c > @@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, > int offset = 0; > bool encap, udpfrag; > int nhoff; > + bool gso_partial = false; > > skb_reset_network_header(skb); > nhoff = skb_network_header(skb) - skb_mac_header(skb); > @@ -101,9 +102,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, > if (IS_ERR(segs)) > goto out; > > + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) > + gso_partial = true; > + > for (skb = segs; skb; skb = skb->next) { > ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); > - if (skb_is_gso(skb)) > + if (skb_is_gso(skb) && gso_partial) > payload_len = skb_shinfo(skb)->gso_size + > SKB_GSO_CB(skb)->data_offset + > skb->head - (unsigned char *)(ipv6h + 1); > -- > 1.9.1 >
On Tue, Aug 23, 2016 at 07:47:32AM -0700, Alexander Duyck wrote: > On Mon, Aug 22, 2016 at 10:20 PM, Steffen Klassert > <steffen.klassert@secunet.com> wrote: > > Since commit 8a29111c7 ("net: gro: allow to build full sized skb") > > gro may build buffers with a frag_list. This can hurt forwarding > > because most NICs can't offload such packets, they need to be > > segmented in software. This patch splits buffers with a frag_list > > at the frag_list pointer into buffers that can be TSO offloaded. > > > > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> > > --- > > net/core/skbuff.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++- > > net/ipv4/af_inet.c | 7 ++-- > > net/ipv4/gre_offload.c | 7 +++- > > net/ipv4/tcp_offload.c | 3 ++ > > net/ipv4/udp_offload.c | 9 +++-- > > net/ipv6/ip6_offload.c | 6 +++- > > 6 files changed, 114 insertions(+), 7 deletions(-) > > > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > > index 3864b4b6..a614e9d 100644 > > --- a/net/core/skbuff.c > > +++ b/net/core/skbuff.c > > @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, > > sg = !!(features & NETIF_F_SG); > > csum = !!can_checksum_protocol(features, proto); > > > > + headroom = skb_headroom(head_skb); > > + > > + if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) && > > + csum && sg && (mss != GSO_BY_FRAGS) && > > + !(features & NETIF_F_GSO_PARTIAL)) { > > Does this really need to be mutually exclusive with > NETIF_F_GSO_PARTIAL and GSO_BY_FRAGS? It should be possible to extend this to NETIF_F_GSO_PARTIAL but I have no test for this. Regarding GSO_BY_FRAGS, this is rather new and just used for sctp. I don't know what sctp does with GSO_BY_FRAGS. > This is occurring early enough > that maybe instead of doubling the size of skb_segment you should look > at instead adding a new static function that could handle splitting > the frag_list and just call that instead of adding this massive amount > of code. 
Ok, will do that. > > Some of these checks are more expensive than others. I would > recommend doing the sg && csum && !(features & NETIF_F_GSO_PARTIAL) > checks first. If possible you could even combine some of the checks > since they are also in the block that sets up partial_segs. That way > we can cut down on the total number of conditional branches needed. We can combine the sg && csum check in the block that sets up partial_segs. In case this is not NETIF_F_GSO_PARTIAL, I'll do the list_skb and net_gso_ok() check and call the new static function then. > > > > + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) > > + gso_partial = true; > > + > > For these kind of blocks it is usually best to just do: > gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); > > The compiler usually does a better job of just doing a bit of > arithmetic instead of generating a set of test/jump type instructions > and generally that runs faster since there is less branching. The > same applies to all the other cases where you setup gso_partial this > way. Good point, I'll change this. Thanks for the review!
On Wed, Aug 24, 2016 at 2:32 AM, Steffen Klassert <steffen.klassert@secunet.com> wrote: > On Tue, Aug 23, 2016 at 07:47:32AM -0700, Alexander Duyck wrote: >> On Mon, Aug 22, 2016 at 10:20 PM, Steffen Klassert >> <steffen.klassert@secunet.com> wrote: >> > Since commit 8a29111c7 ("net: gro: allow to build full sized skb") >> > gro may build buffers with a frag_list. This can hurt forwarding >> > because most NICs can't offload such packets, they need to be >> > segmented in software. This patch splits buffers with a frag_list >> > at the frag_list pointer into buffers that can be TSO offloaded. >> > >> > Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> >> > --- >> > net/core/skbuff.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++- >> > net/ipv4/af_inet.c | 7 ++-- >> > net/ipv4/gre_offload.c | 7 +++- >> > net/ipv4/tcp_offload.c | 3 ++ >> > net/ipv4/udp_offload.c | 9 +++-- >> > net/ipv6/ip6_offload.c | 6 +++- >> > 6 files changed, 114 insertions(+), 7 deletions(-) >> > >> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c >> > index 3864b4b6..a614e9d 100644 >> > --- a/net/core/skbuff.c >> > +++ b/net/core/skbuff.c >> > @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, >> > sg = !!(features & NETIF_F_SG); >> > csum = !!can_checksum_protocol(features, proto); >> > >> > + headroom = skb_headroom(head_skb); >> > + >> > + if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) && >> > + csum && sg && (mss != GSO_BY_FRAGS) && >> > + !(features & NETIF_F_GSO_PARTIAL)) { >> >> Does this really need to be mutually exclusive with >> NETIF_F_GSO_PARTIAL and GSO_BY_FRAGS? > > It should be possible to extend this to NETIF_F_GSO_PARTIAL but > I have no test for this. Regarding GSO_BY_FRAGS, this is rather > new and just used for sctp. I don't know what sctp does with > GSO_BY_FRAGS. I'm adding Marcelo as he could probably explain the GSO_BY_FRAGS functionality better than I could since he is the original author. 
If I recall GSO_BY_FRAGS does something similar to what you are doing, although I believe it doesn't carry any data in the first buffer other than just a header. I believe the idea behind GSO_BY_FRAGS was to allow for segmenting a frame at the frag_list level instead of having it done just based on MSS. That was the only reason why I brought it up. In your case though we may be able to make this easier. If I am not mistaken I believe we should have the main skb, and any in the chain excluding the last containing the same amount of data. That being the case we should be able to determine the size that you would need to segment at by taking skb->len, and removing the length of all the skbuffs hanging off of frag_list. At that point you just use that as your MSS for segmentation and it should break things up so that you have a series of equal sized segments split at the frag_list buffer boundaries. After that all that is left is to update the gso info for the buffers. For GSO_PARTIAL I was handling that on the first segment only. For this change you would need to update that code to address the fact that you would have to determine the number of segments on the first frame and the last since the last could be less than the first, but all of the others in-between should have the same number of segments. - Alex
Em 24-08-2016 13:27, Alexander Duyck escreveu: > On Wed, Aug 24, 2016 at 2:32 AM, Steffen Klassert > <steffen.klassert@secunet.com> wrote: >> On Tue, Aug 23, 2016 at 07:47:32AM -0700, Alexander Duyck wrote: >>> On Mon, Aug 22, 2016 at 10:20 PM, Steffen Klassert >>> <steffen.klassert@secunet.com> wrote: >>>> Since commit 8a29111c7 ("net: gro: allow to build full sized skb") >>>> gro may build buffers with a frag_list. This can hurt forwarding >>>> because most NICs can't offload such packets, they need to be >>>> segmented in software. This patch splits buffers with a frag_list >>>> at the frag_list pointer into buffers that can be TSO offloaded. >>>> >>>> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> >>>> --- >>>> net/core/skbuff.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++- >>>> net/ipv4/af_inet.c | 7 ++-- >>>> net/ipv4/gre_offload.c | 7 +++- >>>> net/ipv4/tcp_offload.c | 3 ++ >>>> net/ipv4/udp_offload.c | 9 +++-- >>>> net/ipv6/ip6_offload.c | 6 +++- >>>> 6 files changed, 114 insertions(+), 7 deletions(-) >>>> >>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c >>>> index 3864b4b6..a614e9d 100644 >>>> --- a/net/core/skbuff.c >>>> +++ b/net/core/skbuff.c >>>> @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, >>>> sg = !!(features & NETIF_F_SG); >>>> csum = !!can_checksum_protocol(features, proto); >>>> >>>> + headroom = skb_headroom(head_skb); >>>> + >>>> + if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) && >>>> + csum && sg && (mss != GSO_BY_FRAGS) && >>>> + !(features & NETIF_F_GSO_PARTIAL)) { >>> >>> Does this really need to be mutually exclusive with >>> NETIF_F_GSO_PARTIAL and GSO_BY_FRAGS? >> >> It should be possible to extend this to NETIF_F_GSO_PARTIAL but >> I have no test for this. Regarding GSO_BY_FRAGS, this is rather >> new and just used for sctp. I don't know what sctp does with >> GSO_BY_FRAGS. 
> > I'm adding Marcelo as he could probably explain the GSO_BY_FRAGS > functionality better than I could since he is the original author. > > If I recall GSO_BY_FRAGS does something similar to what you are doing, > although I believe it doesn't carry any data in the first buffer other > than just a header. I believe the idea behind GSO_BY_FRAGS was to > allow for segmenting a frame at the frag_list level instead of having > it done just based on MSS. That was the only reason why I brought it > up. > That's exactly it. On this no data in the first buffer limitation, we probably can allow it to have some data in there. It was done this way just because sctp is using skb_gro_receive() to build such skb and this was the way I found to get such frag_list skb generated by it, thus preserving frame boundaries. For using GSO_BY_FRAGS in gso_size, it's how skb_is_gso() returns true, but it's similar to the SKB_GSO_PARTIAL rationale in here. We can make sctp also flag it as SKB_GSO_PARTIAL if needed I guess, in case you need to maintain gso_size value. Marcelo > In you case though we maybe be able to make this easier. If I am not > mistaken I believe we should have the main skb, and any in the chain > excluding the last containing the same amount of data. That being the > case we should be able to determine the size that you would need to > segment at by taking skb->len, and removing the length of all the > skbuffs hanging off of frag_list. At that point you just use that as > your MSS for segmentation and it should break things up so that you > have a series of equal sized segments split as the frag_list buffer > boundaries. > > After that all that is left is to update the gso info for the buffers. > For GSO_PARTIAL I was handling that on the first segment only.
For > this change you would need to update that code to address the fact > that you would have to determine the number of segments on the first > frame and the last since the last could be less than the first, but > all of the others in-between should have the same number of segments. > > - Alex >
On Wed, Aug 24, 2016 at 02:25:29PM -0300, Marcelo Ricardo Leitner wrote: > Em 24-08-2016 13:27, Alexander Duyck escreveu: > > > >I'm adding Marcelo as he could probably explain the GSO_BY_FRAGS > >functionality better than I could since he is the original author. > > > >If I recall GSO_BY_FRAGS does something similar to what you are doing, > >although I believe it doesn't carry any data in the first buffer other > >than just a header. I believe the idea behind GSO_BY_FRAGS was to > >allow for segmenting a frame at the frag_list level instead of having > >it done just based on MSS. That was the only reason why I brought it > >up. > > > > That's exactly it. > > On this no data in the first buffer limitation, we probably can > allow it have some data in there. It was done this way just because > sctp is using skb_gro_receive() to build such skb and this was the > way I found to get such frag_list skb generated by it, thus > preserving frame boundaries. Just to understand what you are doing. You generate MTU sized linear buffers in sctp and then, skb_gro_receive() chains up these buffers at the frag_list pointer. skb_gro_receive() does this because skb_gro_offset is null and skb->head_frag is not set in your case. At segmentation, you just need to split at the frag_list pointer because you know that the chained buffers fit the MTU, right?
On Thu, Aug 25, 2016 at 09:31:26AM +0200, Steffen Klassert wrote: > On Wed, Aug 24, 2016 at 02:25:29PM -0300, Marcelo Ricardo Leitner wrote: > > Em 24-08-2016 13:27, Alexander Duyck escreveu: > > > > > >I'm adding Marcelo as he could probably explain the GSO_BY_FRAGS > > >functionality better than I could since he is the original author. > > > > > >If I recall GSO_BY_FRAGS does something similar to what you are doing, > > >although I believe it doesn't carry any data in the first buffer other > > >than just a header. I believe the idea behind GSO_BY_FRAGS was to > > >allow for segmenting a frame at the frag_list level instead of having > > >it done just based on MSS. That was the only reason why I brought it > > >up. > > > > > > > That's exactly it. > > > > On this no data in the first buffer limitation, we probably can > > allow it have some data in there. It was done this way just because > > sctp is using skb_gro_receive() to build such skb and this was the > > way I found to get such frag_list skb generated by it, thus > > preserving frame boundaries. > > Just to understand what you are doing. You generate MTU sized linear > buffers in sctp and then, skb_gro_receive() chains up these buffers > at the frag_list pointer. skb_gro_receive() does this because > skb_gro_offset is null and skb->head_frag is not set in your case. > > At segmentation, you just need to split at the frag_list pointer > because you know that the chained buffers fit the MTU, right? > Correct. Just note that these buffers fit the MTU, but not necessarily use all of it. That is the main point here, variable segmentation size.
On Thu, Aug 25, 2016 at 09:17:18AM -0300, Marcelo Ricardo Leitner wrote: > On Thu, Aug 25, 2016 at 09:31:26AM +0200, Steffen Klassert wrote: > > > > Just to understand what you are doing. You generate MTU sized linear > > buffers in sctp and then, skb_gro_receive() chains up these buffers > > at the frag_list pointer. skb_gro_receive() does this because > > skb_gro_offset is null and skb->head_frag is not set in your case. > > > > At segmentation, you just need to split at the frag_list pointer > > because you know that the chained buffers fit the MTU, right? > > > > Correct. Just note that these buffers fit the MTU, but not necessary > uses all of it. That is main point in here, variable segmentation size. Thanks for the info. This is a very interesting concept, maybe I can use it for IPsec too.
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3864b4b6..a614e9d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3078,6 +3078,92 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); + headroom = skb_headroom(head_skb); + + if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) && + csum && sg && (mss != GSO_BY_FRAGS) && + !(features & NETIF_F_GSO_PARTIAL)) { + unsigned int lskb_segs; + unsigned int delta_segs, delta_len, delta_truesize; + struct sk_buff *nskb; + delta_segs = delta_len = delta_truesize = 0; + + segs = __alloc_skb(skb_headlen(head_skb) + headroom, + GFP_ATOMIC, skb_alloc_rx_flag(head_skb), + NUMA_NO_NODE); + if (unlikely(!segs)) + return ERR_PTR(-ENOMEM); + + skb_reserve(segs, headroom); + skb_put(segs, skb_headlen(head_skb)); + skb_copy_from_linear_data(head_skb, segs->data, segs->len); + copy_skb_header(segs, head_skb); + + if (skb_shinfo(head_skb)->nr_frags) { + int i; + + if (skb_orphan_frags(head_skb, GFP_ATOMIC)) + goto err; + + for (i = 0; i < skb_shinfo(head_skb)->nr_frags; i++) { + skb_shinfo(segs)->frags[i] = skb_shinfo(head_skb)->frags[i]; + skb_frag_ref(head_skb, i); + } + skb_shinfo(segs)->nr_frags = i; + } + + do { + nskb = skb_clone(list_skb, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + + list_skb = list_skb->next; + + if (!tail) + segs->next = nskb; + else + tail->next = nskb; + + tail = nskb; + + if (skb_cow_head(nskb, doffset + headroom)) + goto err; + + lskb_segs = nskb->len / mss; + + skb_shinfo(nskb)->gso_size = mss; + skb_shinfo(nskb)->gso_type = skb_shinfo(head_skb)->gso_type; + skb_shinfo(nskb)->gso_segs = lskb_segs; + + + delta_segs += lskb_segs; + delta_len += nskb->len; + delta_truesize += nskb->truesize; + + __skb_push(nskb, doffset); + + skb_release_head_state(nskb); + __copy_skb_header(nskb, head_skb); + + skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom); + skb_reset_mac_len(nskb); + 
+ skb_copy_from_linear_data_offset(head_skb, -tnl_hlen, + nskb->data - tnl_hlen, + doffset + tnl_hlen); + + } while (list_skb); + + skb_shinfo(segs)->gso_segs -= delta_segs; + segs->len = head_skb->len - delta_len; + segs->data_len = head_skb->data_len - delta_len; + segs->truesize += head_skb->data_len - delta_truesize; + + segs->prev = tail; + + goto out; + } + /* GSO partial only requires that we trim off any excess that * doesn't fit into an MSS sized block, so take care of that * now. @@ -3090,7 +3176,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, partial_segs = 0; } - headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); do { @@ -3307,6 +3392,8 @@ perform_csum_check: swap(tail->destructor, head_skb->destructor); swap(tail->sk, head_skb->sk); } + +out: return segs; err: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 55513e6..c814afa 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1195,7 +1195,7 @@ EXPORT_SYMBOL(inet_sk_rebuild_header); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { - bool udpfrag = false, fixedid = false, encap; + bool udpfrag = false, fixedid = false, gso_partial = false, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; @@ -1248,6 +1248,9 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (IS_ERR_OR_NULL(segs)) goto out; + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) + gso_partial = true; + skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); @@ -1257,7 +1260,7 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, iph->frag_off |= htons(IP_MF); offset += skb->len - nhoff - ihl; tot_len = skb->len - nhoff; - } else if (skb_is_gso(skb)) { + } else if (skb_is_gso(skb) && gso_partial) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index ecd1e09..cf82e28 100644 --- a/net/ipv4/gre_offload.c +++ 
b/net/ipv4/gre_offload.c @@ -24,7 +24,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; - bool need_csum, ufo; + bool need_csum, ufo, gso_partial; if (!skb->encapsulation) goto out; @@ -69,6 +69,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, goto out; } + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) + gso_partial = true; + else + gso_partial = false; + outer_hlen = skb_tnl_header_len(skb); gre_offset = outer_hlen - tnl_hlen; skb = segs; diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 5c59649..dddd227 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -107,6 +107,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, /* Only first segment might have ooo_okay set */ segs->ooo_okay = ooo_okay; + if (skb_is_gso(segs) && !(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL)) + mss = (skb_tail_pointer(segs) - skb_transport_header(segs)) + + segs->data_len - thlen; delta = htonl(oldlen + (thlen + mss)); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 81f253b..dfb6a2c 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -21,7 +21,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, __be16 new_protocol, bool is_ipv6) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - bool remcsum, need_csum, offload_csum, ufo; + bool remcsum, need_csum, offload_csum, ufo, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; @@ -88,6 +88,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, goto out; } + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) + gso_partial = true; + else + gso_partial = false; + outer_hlen = skb_tnl_header_len(skb); udp_offset = outer_hlen - tnl_hlen; skb = segs; @@ -117,7 +122,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (skb_is_gso(skb)) { + if (skb_is_gso(skb) && gso_partial) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 22e90e5..0ec16ba 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -69,6 +69,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int offset = 0; bool encap, udpfrag; int nhoff; + bool gso_partial = false; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); @@ -101,9 +102,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, if (IS_ERR(segs)) goto out; + if (skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL) + gso_partial = true; + for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (skb_is_gso(skb)) + if (skb_is_gso(skb) && gso_partial) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1);
Since commit 8a29111c7 ("net: gro: allow to build full sized skb") gro may build buffers with a frag_list. This can hurt forwarding because most NICs can't offload such packets, they need to be segmented in software. This patch splits buffers with a frag_list at the frag_list pointer into buffers that can be TSO offloaded. Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com> --- net/core/skbuff.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++- net/ipv4/af_inet.c | 7 ++-- net/ipv4/gre_offload.c | 7 +++- net/ipv4/tcp_offload.c | 3 ++ net/ipv4/udp_offload.c | 9 +++-- net/ipv6/ip6_offload.c | 6 +++- 6 files changed, 114 insertions(+), 7 deletions(-)