diff mbox series

[net-next,v6,1/2] net: permit skb_segment on head_frag frag_list skb

Message ID 20180321233104.2142764-2-yhs@fb.com
State Accepted, archived
Delegated to: David Miller
Headers show
Series net: permit skb_segment on head_frag frag_list skb | expand

Commit Message

Yonghong Song March 21, 2018, 11:31 p.m. UTC
One of our in-house projects, bpf-based NAT, hits a kernel BUG_ON at
function skb_segment(), line 3667. The bpf program attaches to
clsact ingress, calls bpf_skb_change_proto to change protocol
from ipv4 to ipv6 or from ipv6 to ipv4, and then calls bpf_redirect
to send the changed packet out.

3472 struct sk_buff *skb_segment(struct sk_buff *head_skb,
3473                             netdev_features_t features)
3474 {
3475         struct sk_buff *segs = NULL;
3476         struct sk_buff *tail = NULL;
...
3665                 while (pos < offset + len) {
3666                         if (i >= nfrags) {
3667                                 BUG_ON(skb_headlen(list_skb));
3668
3669                                 i = 0;
3670                                 nfrags = skb_shinfo(list_skb)->nr_frags;
3671                                 frag = skb_shinfo(list_skb)->frags;
3672                                 frag_skb = list_skb;
...

call stack:
...
 #1 [ffff883ffef03558] __crash_kexec at ffffffff8110c525
 #2 [ffff883ffef03620] crash_kexec at ffffffff8110d5cc
 #3 [ffff883ffef03640] oops_end at ffffffff8101d7e7
 #4 [ffff883ffef03668] die at ffffffff8101deb2
 #5 [ffff883ffef03698] do_trap at ffffffff8101a700
 #6 [ffff883ffef036e8] do_error_trap at ffffffff8101abfe
 #7 [ffff883ffef037a0] do_invalid_op at ffffffff8101acd0
 #8 [ffff883ffef037b0] invalid_op at ffffffff81a00bab
    [exception RIP: skb_segment+3044]
    RIP: ffffffff817e4dd4  RSP: ffff883ffef03860  RFLAGS: 00010216
    RAX: 0000000000002bf6  RBX: ffff883feb7aaa00  RCX: 0000000000000011
    RDX: ffff883fb87910c0  RSI: 0000000000000011  RDI: ffff883feb7ab500
    RBP: ffff883ffef03928   R8: 0000000000002ce2   R9: 00000000000027da
    R10: 000001ea00000000  R11: 0000000000002d82  R12: ffff883f90a1ee80
    R13: ffff883fb8791120  R14: ffff883feb7abc00  R15: 0000000000002ce2
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #9 [ffff883ffef03930] tcp_gso_segment at ffffffff818713e7
--- <IRQ stack> ---
...

The triggering input skb has the following properties:
    list_skb = skb->frag_list;
    skb->nfrags != NULL && skb_headlen(list_skb) != 0
and skb_segment() is not able to handle a frag_list skb
if its headlen (list_skb->len - list_skb->data_len) is not 0.

This patch addressed the issue by handling skb_headlen(list_skb) != 0
case properly if list_skb->head_frag is true, which is expected in
most cases. The head frag is processed before list_skb->frags
are processed.

Reported-by: Diptanu Gon Choudhury <diptanu@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 net/core/skbuff.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

Comments

Alexander H Duyck March 22, 2018, 12:32 a.m. UTC | #1
On Wed, Mar 21, 2018 at 4:31 PM, Yonghong Song <yhs@fb.com> wrote:
> One of our in-house projects, bpf-based NAT, hits a kernel BUG_ON at
> function skb_segment(), line 3667. The bpf program attaches to
> clsact ingress, calls bpf_skb_change_proto to change protocol
> from ipv4 to ipv6 or from ipv6 to ipv4, and then calls bpf_redirect
> to send the changed packet out.
>
> 3472 struct sk_buff *skb_segment(struct sk_buff *head_skb,
> 3473                             netdev_features_t features)
> 3474 {
> 3475         struct sk_buff *segs = NULL;
> 3476         struct sk_buff *tail = NULL;
> ...
> 3665                 while (pos < offset + len) {
> 3666                         if (i >= nfrags) {
> 3667                                 BUG_ON(skb_headlen(list_skb));
> 3668
> 3669                                 i = 0;
> 3670                                 nfrags = skb_shinfo(list_skb)->nr_frags;
> 3671                                 frag = skb_shinfo(list_skb)->frags;
> 3672                                 frag_skb = list_skb;
> ...
>
> call stack:
> ...
>  #1 [ffff883ffef03558] __crash_kexec at ffffffff8110c525
>  #2 [ffff883ffef03620] crash_kexec at ffffffff8110d5cc
>  #3 [ffff883ffef03640] oops_end at ffffffff8101d7e7
>  #4 [ffff883ffef03668] die at ffffffff8101deb2
>  #5 [ffff883ffef03698] do_trap at ffffffff8101a700
>  #6 [ffff883ffef036e8] do_error_trap at ffffffff8101abfe
>  #7 [ffff883ffef037a0] do_invalid_op at ffffffff8101acd0
>  #8 [ffff883ffef037b0] invalid_op at ffffffff81a00bab
>     [exception RIP: skb_segment+3044]
>     RIP: ffffffff817e4dd4  RSP: ffff883ffef03860  RFLAGS: 00010216
>     RAX: 0000000000002bf6  RBX: ffff883feb7aaa00  RCX: 0000000000000011
>     RDX: ffff883fb87910c0  RSI: 0000000000000011  RDI: ffff883feb7ab500
>     RBP: ffff883ffef03928   R8: 0000000000002ce2   R9: 00000000000027da
>     R10: 000001ea00000000  R11: 0000000000002d82  R12: ffff883f90a1ee80
>     R13: ffff883fb8791120  R14: ffff883feb7abc00  R15: 0000000000002ce2
>     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
>  #9 [ffff883ffef03930] tcp_gso_segment at ffffffff818713e7
> --- <IRQ stack> ---
> ...
>
> The triggering input skb has the following properties:
>     list_skb = skb->frag_list;
>     skb->nfrags != NULL && skb_headlen(list_skb) != 0
> and skb_segment() is not able to handle a frag_list skb
> if its headlen (list_skb->len - list_skb->data_len) is not 0.
>
> This patch addressed the issue by handling skb_headlen(list_skb) != 0
> case properly if list_skb->head_frag is true, which is expected in
> most cases. The head frag is processed before list_skb->frags
> are processed.
>
> Reported-by: Diptanu Gon Choudhury <diptanu@fb.com>
> Signed-off-by: Yonghong Song <yhs@fb.com>

This looks good to me.

Reviewed-by: Alexander Duyck <alexander.h.duyck@intel.com>
diff mbox series

Patch

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 715c134..4e1d4e7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3460,6 +3460,19 @@  void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
 }
 EXPORT_SYMBOL_GPL(skb_pull_rcsum);
 
+static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
+{
+	skb_frag_t head_frag;
+	struct page *page;
+
+	page = virt_to_head_page(frag_skb->head);
+	head_frag.page.p = page;
+	head_frag.page_offset = frag_skb->data -
+		(unsigned char *)page_address(page);
+	head_frag.size = skb_headlen(frag_skb);
+	return head_frag;
+}
+
 /**
  *	skb_segment - Perform protocol segmentation on skb.
  *	@head_skb: buffer to segment
@@ -3664,15 +3677,19 @@  struct sk_buff *skb_segment(struct sk_buff *head_skb,
 
 		while (pos < offset + len) {
 			if (i >= nfrags) {
-				BUG_ON(skb_headlen(list_skb));
-
 				i = 0;
 				nfrags = skb_shinfo(list_skb)->nr_frags;
 				frag = skb_shinfo(list_skb)->frags;
 				frag_skb = list_skb;
+				if (!skb_headlen(list_skb)) {
+					BUG_ON(!nfrags);
+				} else {
+					BUG_ON(!list_skb->head_frag);
 
-				BUG_ON(!nfrags);
-
+					/* to make room for head_frag. */
+					i--;
+					frag--;
+				}
 				if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
 				    skb_zerocopy_clone(nskb, frag_skb,
 						       GFP_ATOMIC))
@@ -3689,7 +3706,7 @@  struct sk_buff *skb_segment(struct sk_buff *head_skb,
 				goto err;
 			}
 
-			*nskb_frag = *frag;
+			*nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
 			__skb_frag_ref(nskb_frag);
 			size = skb_frag_size(nskb_frag);