diff mbox

[ovs-dev] datapath: fix skb_panic due to the incorrect actions attrlen

Message ID 1502923730-30062-1-git-send-email-gvrose8192@gmail.com
State Superseded
Headers show

Commit Message

Gregory Rose Aug. 16, 2017, 10:48 p.m. UTC
Upstream commit:
    commit 494bea39f3201776cdfddc232705f54a0bd210c4
    Author: Liping Zhang <zlpnobody@gmail.com>
    Date:   Wed Aug 16 13:30:07 2017 +0800

    For sw_flow_actions, the actions_len only represents the kernel part's
    size, and when we dump the actions to the userspace, we will do the
    convertions, so it's true size may become bigger than the actions_len.

    But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
    to alloc the skbuff, so the user_skb's size may become insufficient and
    oops will happen like this:
      skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
      ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
      ------------[ cut here ]------------
      kernel BUG at net/core/skbuff.c:129!
      [...]
      Call Trace:
       <IRQ>
       [<ffffffff8148be82>] skb_put+0x43/0x44
       [<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
       [<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
       [<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
       [<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
       [<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
       [<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
       [<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
       [<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
       [<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
       [<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
      [...]

    Also we can find that the actions_len is much little than the orig_len:
      crash> struct sw_flow_actions 0xffff8812f539d000
      struct sw_flow_actions {
        rcu = {
          next = 0xffff8812f5398800,
          func = 0xffffe3b00035db32
        },
        orig_len = 1384,
        actions_len = 592,
        actions = 0xffff8812f539d01c
      }

    So as a quick fix, use the orig_len instead of the actions_len to alloc
    the user_skb.

    Last, this oops happened on our system running a relative old kernel, but
    the same risk still exists on the mainline, since we use the wrong
    actions_len from the beginning.

    Fixes: 0e469d3b380c ("datapath: include datapath actions with sampled-"...)
    Cc: Neil McKee <neil.mckee@inmon.com>
    Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
    Acked-by: Pravin B Shelar <pshelar@ovn.org>
    Signed-off-by: David S. Miller <davem@davemloft.net>

Signed-off-by: Greg Rose <gvrose8192@gmail.com>
---
 datapath/actions.c  | 1 +
 datapath/datapath.c | 7 ++++---
 datapath/datapath.h | 2 ++
 3 files changed, 7 insertions(+), 3 deletions(-)

Comments

Joe Stringer Aug. 18, 2017, 9:41 p.m. UTC | #1
On 16 August 2017 at 15:48, Greg Rose <gvrose8192@gmail.com> wrote:
> Upstream commit:
>     commit 494bea39f3201776cdfddc232705f54a0bd210c4
>     Author: Liping Zhang <zlpnobody@gmail.com>
>     Date:   Wed Aug 16 13:30:07 2017 +0800
>
>     For sw_flow_actions, the actions_len only represents the kernel part's
>     size, and when we dump the actions to the userspace, we will do the
>     convertions, so it's true size may become bigger than the actions_len.
>
>     But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
>     to alloc the skbuff, so the user_skb's size may become insufficient and
>     oops will happen like this:
>       skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
>       ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
>       ------------[ cut here ]------------
>       kernel BUG at net/core/skbuff.c:129!
>       [...]
>       Call Trace:
>        <IRQ>
>        [<ffffffff8148be82>] skb_put+0x43/0x44
>        [<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
>        [<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
>        [<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
>        [<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
>        [<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
>        [<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
>        [<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
>        [<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
>        [<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
>        [<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
>       [...]
>
>     Also we can find that the actions_len is much little than the orig_len:
>       crash> struct sw_flow_actions 0xffff8812f539d000
>       struct sw_flow_actions {
>         rcu = {
>           next = 0xffff8812f5398800,
>           func = 0xffffe3b00035db32
>         },
>         orig_len = 1384,
>         actions_len = 592,
>         actions = 0xffff8812f539d01c
>       }
>
>     So as a quick fix, use the orig_len instead of the actions_len to alloc
>     the user_skb.
>
>     Last, this oops happened on our system running a relative old kernel, but
>     the same risk still exists on the mainline, since we use the wrong
>     actions_len from the beginning.
>
>     Fixes: 0e469d3b380c ("datapath: include datapath actions with sampled-"...)
>     Cc: Neil McKee <neil.mckee@inmon.com>
>     Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
>     Acked-by: Pravin B Shelar <pshelar@ovn.org>
>     Signed-off-by: David S. Miller <davem@davemloft.net>
>
> Signed-off-by: Greg Rose <gvrose8192@gmail.com>
> ---

Thanks for the backport.

It seems like we're a bit diverged from the latest net-next, if I
follow correctly these patches haven't been backported yet. Would you
mind preparing these, too?

0ed80da518a1 openvswitch: Remove unnecessary newlines from OVS_NLERR uses
c4b2bf6b4a35 openvswitch: Optimize operations for OvS flow_stats.
c57c054eb5b1 openvswitch: Optimize updating for OvS flow_stats.
880388aa3c07 net: Remove all references to SKB_GSO_UDP.

I realise that this particular patch is a long-standing bug that we
can fix (and will need backporting in our tree), but it's nice if we
can keep the patches applied to master here in the same order as
upstream net-next so that it's easier to tell how out of sync we are
by a side-by-side comparison of "git log --oneline" for datapath/ or
net/openvswitch in the corresponding trees. I also realise that this
patch didn't land in net-next yet since it's so new, but it's fairly
safe to assume it'll be applied there next.

Cheers,
Joe
Gregory Rose Aug. 22, 2017, 3:26 p.m. UTC | #2
On 08/18/2017 02:41 PM, Joe Stringer wrote:
> On 16 August 2017 at 15:48, Greg Rose <gvrose8192@gmail.com> wrote:
> > Upstream commit:
> >      commit 494bea39f3201776cdfddc232705f54a0bd210c4
> >      Author: Liping Zhang <zlpnobody@gmail.com>
> >      Date:   Wed Aug 16 13:30:07 2017 +0800
> >
> >      For sw_flow_actions, the actions_len only represents the kernel part's
> >      size, and when we dump the actions to the userspace, we will do the
> >      convertions, so it's true size may become bigger than the actions_len.
> >
> >      But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
> >      to alloc the skbuff, so the user_skb's size may become insufficient and
> >      oops will happen like this:
> >        skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157 head:
> >        ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0 dev:<NULL>
> >        ------------[ cut here ]------------
> >        kernel BUG at net/core/skbuff.c:129!
> >        [...]
> >        Call Trace:
> >         <IRQ>
> >         [<ffffffff8148be82>] skb_put+0x43/0x44
> >         [<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
> >         [<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448 [openvswitch]
> >         [<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
> >         [<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
> >         [<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
> >         [<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8 [openvswitch]
> >         [<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106 [openvswitch]
> >         [<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
> >         [<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
> >         [<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
> >        [...]
> >
> >      Also we can find that the actions_len is much little than the orig_len:
> >        crash> struct sw_flow_actions 0xffff8812f539d000
> >        struct sw_flow_actions {
> >          rcu = {
> >            next = 0xffff8812f5398800,
> >            func = 0xffffe3b00035db32
> >          },
> >          orig_len = 1384,
> >          actions_len = 592,
> >          actions = 0xffff8812f539d01c
> >        }
> >
> >      So as a quick fix, use the orig_len instead of the actions_len to alloc
> >      the user_skb.
> >
> >      Last, this oops happened on our system running a relative old kernel, but
> >      the same risk still exists on the mainline, since we use the wrong
> >      actions_len from the beginning.
> >
> >      Fixes: 0e469d3b380c ("datapath: include datapath actions with sampled-"...)
> >      Cc: Neil McKee <neil.mckee@inmon.com>
> >      Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
> >      Acked-by: Pravin B Shelar <pshelar@ovn.org>
> >      Signed-off-by: David S. Miller <davem@davemloft.net>
> >
> > Signed-off-by: Greg Rose <gvrose8192@gmail.com>
> > ---
>
> Thanks for the backport.
>
> It seems like we're a bit diverged from the latest net-next, if I
> follow correctly these patches haven't been backported yet. Would you
> mind preparing these, too?
>
> 0ed80da518a1 openvswitch: Remove unnecessary newlines from OVS_NLERR uses
> c4b2bf6b4a35 openvswitch: Optimize operations for OvS flow_stats.
> c57c054eb5b1 openvswitch: Optimize updating for OvS flow_stats.
> 880388aa3c07 net: Remove all references to SKB_GSO_UDP.
>
> I realise that this particular patch is a long-standing bug that we
> can fix (and will need backporting in our tree), but it's nice if we
> can keep the patches applied to master here in the same order as
> upstream net-next so that it's easier to tell how out of sync we are
> by a side-by-side comparison of "git log --oneline" for datapath/ or
> net/openvswitch in the corresponding trees. I also realise that this
> patch didn't land in net-next yet since it's so new, but it's fairly
> safe to assume it'll be applied there next.
>
> Cheers,
> Joe
>
OK - I had been waiting for patches to hit the net tree before backporting.  I'll get the ones in net-next too and submit a patch series.

Thanks,

- Greg
Joe Stringer Aug. 22, 2017, 5:15 p.m. UTC | #3
On 22 August 2017 at 08:26, Greg Rose <gvrose8192@gmail.com> wrote:
> On 08/18/2017 02:41 PM, Joe Stringer wrote:
>>
>> On 16 August 2017 at 15:48, Greg Rose <gvrose8192@gmail.com> wrote:
>> > Upstream commit:
>> >      commit 494bea39f3201776cdfddc232705f54a0bd210c4
>> >      Author: Liping Zhang <zlpnobody@gmail.com>
>> >      Date:   Wed Aug 16 13:30:07 2017 +0800
>> >
>> >      For sw_flow_actions, the actions_len only represents the kernel
>> > part's
>> >      size, and when we dump the actions to the userspace, we will do the
>> >      convertions, so it's true size may become bigger than the
>> > actions_len.
>> >
>> >      But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the
>> > actions_len
>> >      to alloc the skbuff, so the user_skb's size may become insufficient
>> > and
>> >      oops will happen like this:
>> >        skbuff: skb_over_panic: text:ffffffff8148fabf len:1749 put:157
>> > head:
>> >        ffff881300f39000 data:ffff881300f39000 tail:0x6d5 end:0x6c0
>> > dev:<NULL>
>> >        ------------[ cut here ]------------
>> >        kernel BUG at net/core/skbuff.c:129!
>> >        [...]
>> >        Call Trace:
>> >         <IRQ>
>> >         [<ffffffff8148be82>] skb_put+0x43/0x44
>> >         [<ffffffff8148fabf>] skb_zerocopy+0x6c/0x1f4
>> >         [<ffffffffa0290d36>] queue_userspace_packet+0x3a3/0x448
>> > [openvswitch]
>> >         [<ffffffffa0292023>] ovs_dp_upcall+0x30/0x5c [openvswitch]
>> >         [<ffffffffa028d435>] output_userspace+0x132/0x158 [openvswitch]
>> >         [<ffffffffa01e6890>] ? ip6_rcv_finish+0x74/0x77 [ipv6]
>> >         [<ffffffffa028e277>] do_execute_actions+0xcc1/0xdc8
>> > [openvswitch]
>> >         [<ffffffffa028e3f2>] ovs_execute_actions+0x74/0x106
>> > [openvswitch]
>> >         [<ffffffffa0292130>] ovs_dp_process_packet+0xe1/0xfd
>> > [openvswitch]
>> >         [<ffffffffa0292b77>] ? key_extract+0x63c/0x8d5 [openvswitch]
>> >         [<ffffffffa029848b>] ovs_vport_receive+0xa1/0xc3 [openvswitch]
>> >        [...]
>> >
>> >      Also we can find that the actions_len is much little than the
>> > orig_len:
>> >        crash> struct sw_flow_actions 0xffff8812f539d000
>> >        struct sw_flow_actions {
>> >          rcu = {
>> >            next = 0xffff8812f5398800,
>> >            func = 0xffffe3b00035db32
>> >          },
>> >          orig_len = 1384,
>> >          actions_len = 592,
>> >          actions = 0xffff8812f539d01c
>> >        }
>> >
>> >      So as a quick fix, use the orig_len instead of the actions_len to
>> > alloc
>> >      the user_skb.
>> >
>> >      Last, this oops happened on our system running a relative old
>> > kernel, but
>> >      the same risk still exists on the mainline, since we use the wrong
>> >      actions_len from the beginning.
>> >
>> >      Fixes: 0e469d3b380c ("datapath: include datapath actions with
>> > sampled-"...)
>> >      Cc: Neil McKee <neil.mckee@inmon.com>
>> >      Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
>> >      Acked-by: Pravin B Shelar <pshelar@ovn.org>
>> >      Signed-off-by: David S. Miller <davem@davemloft.net>
>> >
>> > Signed-off-by: Greg Rose <gvrose8192@gmail.com>
>> > ---
>>
>> Thanks for the backport.
>>
>> It seems like we're a bit diverged from the latest net-next, if I
>> follow correctly these patches haven't been backported yet. Would you
>> mind preparing these, too?
>>
>> 0ed80da518a1 openvswitch: Remove unnecessary newlines from OVS_NLERR uses
>> c4b2bf6b4a35 openvswitch: Optimize operations for OvS flow_stats.
>> c57c054eb5b1 openvswitch: Optimize updating for OvS flow_stats.
>> 880388aa3c07 net: Remove all references to SKB_GSO_UDP.
>>
>> I realise that this particular patch is a long-standing bug that we
>> can fix (and will need backporting in our tree), but it's nice if we
>> can keep the patches applied to master here in the same order as
>> upstream net-next so that it's easier to tell how out of sync we are
>> by a side-by-side comparison of "git log --oneline" for datapath/ or
>> net/openvswitch in the corresponding trees. I also realise that this
>> patch didn't land in net-next yet since it's so new, but it's fairly
>> safe to assume it'll be applied there next.
>>
>> Cheers,
>> Joe
>>
> OK - I had been waiting for patches to hit the net tree before backporting.
> I'll get the ones in net-next too and submit a patch series.

Thanks! I'll keep an eye out for them.
diff mbox

Patch

diff --git a/datapath/actions.c b/datapath/actions.c
index 59d91b2..ad18c2c 100644
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -1348,6 +1348,7 @@  int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
 		goto out;
 	}
 
+	OVS_CB(skb)->acts_origlen = acts->orig_len;
 	err = do_execute_actions(dp, skb, key,
 				 acts->actions, acts->actions_len);
 
diff --git a/datapath/datapath.c b/datapath/datapath.c
index b565fc5..1780819 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -388,7 +388,7 @@  static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
 }
 
 static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
-			      unsigned int hdrlen)
+			      unsigned int hdrlen, int actions_attrlen)
 {
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
 		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
@@ -405,7 +405,7 @@  static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info,
 
 	/* OVS_PACKET_ATTR_ACTIONS */
 	if (upcall_info->actions_len)
-		size += nla_total_size(upcall_info->actions_len);
+		size += nla_total_size(actions_attrlen);
 
 	/* OVS_PACKET_ATTR_MRU */
 	if (upcall_info->mru)
@@ -472,7 +472,8 @@  static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 	else
 		hlen = skb->len;
 
-	len = upcall_msg_size(upcall_info, hlen - cutlen);
+	len = upcall_msg_size(upcall_info, hlen - cutlen,
+			      OVS_CB(skb)->acts_origlen);
 	user_skb = genlmsg_new(len, GFP_ATOMIC);
 	if (!user_skb) {
 		err = -ENOMEM;
diff --git a/datapath/datapath.h b/datapath/datapath.h
index f20deed..70ad0ac 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -100,11 +100,13 @@  struct datapath {
  * when a packet is received by OVS.
  * @mru: The maximum received fragement size; 0 if the packet is not
  * fragmented.
+ * @acts_origlen: The netlink size of the flow actions applied to this skb.
  * @cutlen: The number of bytes from the packet end to be removed.
  */
 struct ovs_skb_cb {
 	struct vport		*input_vport;
 	u16			mru;
+	u16			acts_origlen;
 	u32			cutlen;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)