diff mbox

[ovs-dev] netdev-dummy: fix crash with more than one passive connection

Message ID 1467848392-12085-1-git-send-email-lrichard@redhat.com
State Accepted
Headers show

Commit Message

Lance Richardson July 6, 2016, 11:39 p.m. UTC
Investigation found that Some of the occasional failures in the
"ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case are caused
by ovs-vswitchd crashing with SIGSEGV. It turns out that the
crash occurrs when the number of netdev-dummy passive connections
transitions from 1 to 2.  When xrealloc() copies the array of
dummy_packet_stream structures from the original buffer to a
newly allocated one, the struct ovs_list txq member of the structure
becomes corrupt (e.g. if ovs_list_is_empty() would have returned
false before the copy, it will return true after the copy, which
will lead to a crash when the bogus packet buffer on the list is
dereferenced).

Fix by taking a hint from David Wheeler and adding a level of
indirection.

Signed-off-by: Lance Richardson <lrichard@redhat.com>
---
 lib/netdev-dummy.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

Comments

Lance Richardson July 18, 2016, 4:29 p.m. UTC | #1
> From: "Lance Richardson" <lrichard@redhat.com>
> To: dev@openvswitch.org
> Sent: Wednesday, July 6, 2016 7:39:52 PM
> Subject: [ovs-dev] [PATCH] netdev-dummy: fix crash with more than one	passive connection
> 
> Investigation found that Some of the occasional failures in the
> "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case are caused
> by ovs-vswitchd crashing with SIGSEGV. It turns out that the
> crash occurrs when the number of netdev-dummy passive connections
> transitions from 1 to 2.  When xrealloc() copies the array of
> dummy_packet_stream structures from the original buffer to a
> newly allocated one, the struct ovs_list txq member of the structure
> becomes corrupt (e.g. if ovs_list_is_empty() would have returned
> false before the copy, it will return true after the copy, which
> will lead to a crash when the bogus packet buffer on the list is
> dereferenced).
> 
> Fix by taking a hint from David Wheeler and adding a level of
> indirection.
> 
> Signed-off-by: Lance Richardson <lrichard@redhat.com>
> ---
>  lib/netdev-dummy.c | 17 ++++++++++-------
>  1 file changed, 10 insertions(+), 7 deletions(-)
> 
> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> index 24c107e..c99db2b 100644
> --- a/lib/netdev-dummy.c
> +++ b/lib/netdev-dummy.c
> @@ -68,7 +68,7 @@ enum dummy_netdev_conn_state {
>  
>  struct dummy_packet_pconn {
>      struct pstream *pstream;
> -    struct dummy_packet_stream *streams;
> +    struct dummy_packet_stream **streams;
>      size_t n_streams;
>  };
>  
> @@ -328,7 +328,8 @@ dummy_packet_conn_close(struct dummy_packet_conn *conn)
>      case PASSIVE:
>          pstream_close(pconn->pstream);
>          for (i = 0; i < pconn->n_streams; i++) {
> -            dummy_packet_stream_close(&pconn->streams[i]);
> +            dummy_packet_stream_close(pconn->streams[i]);
> +            free(pconn->streams[i]);
>          }
>          free(pconn->streams);
>          pconn->pstream = NULL;
> @@ -446,8 +447,9 @@ dummy_pconn_run(struct netdev_dummy *dev)
>  
>          pconn->streams = xrealloc(pconn->streams,
>                                  ((pconn->n_streams + 1)
> -                                 * sizeof *s));
> -        s = &pconn->streams[pconn->n_streams++];
> +                                 * sizeof s));
> +        s = xmalloc(sizeof *s);
> +        pconn->streams[pconn->n_streams++] = s;
>          dummy_packet_stream_init(s, new_stream);
>      } else if (error != EAGAIN) {
>          VLOG_WARN("%s: accept failed (%s)",
> @@ -458,7 +460,7 @@ dummy_pconn_run(struct netdev_dummy *dev)
>      }
>  
>      for (i = 0; i < pconn->n_streams; i++) {
> -        struct dummy_packet_stream *s = &pconn->streams[i];
> +        struct dummy_packet_stream *s = pconn->streams[i];
>  
>          error = dummy_packet_stream_run(dev, s);
>          if (error) {
> @@ -466,6 +468,7 @@ dummy_pconn_run(struct netdev_dummy *dev)
>                       stream_get_name(s->stream),
>                       ovs_retval_to_string(error));
>              dummy_packet_stream_close(s);
> +            free(s);
>              pconn->streams[i] = pconn->streams[--pconn->n_streams];
>          }
>      }
> @@ -553,7 +556,7 @@ dummy_packet_conn_wait(struct dummy_packet_conn *conn)
>      case PASSIVE:
>          pstream_wait(conn->u.pconn.pstream);
>          for (i = 0; i < conn->u.pconn.n_streams; i++) {
> -            struct dummy_packet_stream *s = &conn->u.pconn.streams[i];
> +            struct dummy_packet_stream *s = conn->u.pconn.streams[i];
>              dummy_packet_stream_wait(s);
>          }
>          break;
> @@ -578,7 +581,7 @@ dummy_packet_conn_send(struct dummy_packet_conn *conn,
>      switch (conn->type) {
>      case PASSIVE:
>          for (i = 0; i < conn->u.pconn.n_streams; i++) {
> -            struct dummy_packet_stream *s = &conn->u.pconn.streams[i];
> +            struct dummy_packet_stream *s = conn->u.pconn.streams[i];
>  
>              dummy_packet_stream_send(s, buffer, size);
>              pstream_wait(conn->u.pconn.pstream);
> --
> 2.5.5
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev
> 

Here is a small script that reliably reproduces the crash in ovs-vswitchd.
I don't have an explanation for why we have two connections to the same
port in the "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case (this
happens infrequently), perhaps it's something like the active connect from
a peer ovs-vswitchd being interrupted and re-tried.

#!/bin/bash

set -ex

PWD=$(pwd)
export PATH=${PWD}/vswitchd:${PWD}/utilities:${PWD}/ovsdb:$PATH

ovs_setenv() {
    export OVS_RUNDIR=${PWD}/$1
    export OVS_LOGDIR=${PWD}/$1
    export OVS_DBDIR=${PWD}/$1
    export OVS_SYSCONFDIR=${PWD}/$1
    export OVS_PKGDATADIR=${PWD}/$1
}

for x in repro_main repro_hv1 repro_hv2; do
    mkdir -p $x
    rm -f $x/*
    ovs_setenv $x
    : > $OVS_RUNDIR/.conf.db.~lock~
    ovsdb-tool create $OVS_RUNDIR/conf.db vswitchd/vswitch.ovsschema
    ovsdb-server --remote=punix:$OVS_RUNDIR/db.sock  -vconsole:off --detach --no-chdir --pidfile --log-file
    ovs-vsctl --no-wait -- init
    ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -vunixctl -vconsole:off --detach --no-chdir --pidfile --log-file
done

ovs_setenv repro_main
ovs-vsctl add-br foo  \
          -- add-port foo p1 \
          -- set Interface p1 options:pstream="punix:$PWD/repro_main/p1.sock"

ovs_setenv repro_hv1
ovs-vsctl add-br br1 \
          -- add-port br1 p1 \
          -- set Interface p1 options:stream="unix:$PWD/repro_main/p1.sock"

ovs_setenv repro_hv2
ovs-vsctl add-br br1 \
          -- add-port br1 p1 \
          -- set Interface p1 options:stream="unix:$PWD/repro_main/p1.sock"
Ryan Moats July 19, 2016, 3:36 a.m. UTC | #2
"dev" <dev-bounces@openvswitch.org> wrote on 07/18/2016 11:29:28 AM:

>
> > From: "Lance Richardson" <lrichard@redhat.com>
> > To: dev@openvswitch.org
> > Sent: Wednesday, July 6, 2016 7:39:52 PM
> > Subject: [ovs-dev] [PATCH] netdev-dummy: fix crash with more than
> one   passive connection
> >
> > Investigation found that Some of the occasional failures in the
> > "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case are caused
> > by ovs-vswitchd crashing with SIGSEGV. It turns out that the
> > crash occurrs when the number of netdev-dummy passive connections
> > transitions from 1 to 2.  When xrealloc() copies the array of
> > dummy_packet_stream structures from the original buffer to a
> > newly allocated one, the struct ovs_list txq member of the structure
> > becomes corrupt (e.g. if ovs_list_is_empty() would have returned
> > false before the copy, it will return true after the copy, which
> > will lead to a crash when the bogus packet buffer on the list is
> > dereferenced).
> >
> > Fix by taking a hint from David Wheeler and adding a level of
> > indirection.
> >
> > Signed-off-by: Lance Richardson <lrichard@redhat.com>

[snip]

>
> Here is a small script that reliably reproduces the crash in
ovs-vswitchd.
> I don't have an explanation for why we have two connections to the same
> port in the "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case (this
> happens infrequently), perhaps it's something like the active connect
from
> a peer ovs-vswitchd being interrupted and re-tried.
>
> #!/bin/bash
>
> set -ex
>
> PWD=$(pwd)
> export PATH=${PWD}/vswitchd:${PWD}/utilities:${PWD}/ovsdb:$PATH
>
> ovs_setenv() {
>     export OVS_RUNDIR=${PWD}/$1
>     export OVS_LOGDIR=${PWD}/$1
>     export OVS_DBDIR=${PWD}/$1
>     export OVS_SYSCONFDIR=${PWD}/$1
>     export OVS_PKGDATADIR=${PWD}/$1
> }
>
> for x in repro_main repro_hv1 repro_hv2; do
>     mkdir -p $x
>     rm -f $x/*
>     ovs_setenv $x
>     : > $OVS_RUNDIR/.conf.db.~lock~
>     ovsdb-tool create $OVS_RUNDIR/conf.db vswitchd/vswitch.ovsschema
>     ovsdb-server --remote=punix:$OVS_RUNDIR/db.sock  -vconsole:off
> --detach --no-chdir --pidfile --log-file
>     ovs-vsctl --no-wait -- init
>     ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -
> vunixctl -vconsole:off --detach --no-chdir --pidfile --log-file
> done
>
> ovs_setenv repro_main
> ovs-vsctl add-br foo  \
>           -- add-port foo p1 \
>           -- set Interface p1
options:pstream="punix:$PWD/repro_main/p1.sock"
>
> ovs_setenv repro_hv1
> ovs-vsctl add-br br1 \
>           -- add-port br1 p1 \
>           -- set Interface p1
options:stream="unix:$PWD/repro_main/p1.sock"
>
> ovs_setenv repro_hv2
> ovs-vsctl add-br br1 \
>           -- add-port br1 p1 \
>           -- set Interface p1
options:stream="unix:$PWD/repro_main/p1.sock"

I like this, but I think I'm going to be consistent and ask if you
can spin a new version with the above script as a unit test so that
we can see the crash before applying the rest of the patch and then
verify that it doesn't happen with the patch.

Ryan
Lance Richardson July 19, 2016, 2:07 p.m. UTC | #3
----- Original Message -----
> From: "Ryan Moats" <rmoats@us.ibm.com>
> To: "Lance Richardson" <lrichard@redhat.com>
> Cc: dev@openvswitch.org
> Sent: Monday, July 18, 2016 11:36:27 PM
> Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more than	one	passive connection
> 
> "dev" <dev-bounces@openvswitch.org> wrote on 07/18/2016 11:29:28 AM:
> 
> >
> > > From: "Lance Richardson" <lrichard@redhat.com>
> > > To: dev@openvswitch.org
> > > Sent: Wednesday, July 6, 2016 7:39:52 PM
> > > Subject: [ovs-dev] [PATCH] netdev-dummy: fix crash with more than
> > one   passive connection
> > >
> > > Investigation found that Some of the occasional failures in the
> > > "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case are caused
> > > by ovs-vswitchd crashing with SIGSEGV. It turns out that the
> > > crash occurrs when the number of netdev-dummy passive connections
> > > transitions from 1 to 2.  When xrealloc() copies the array of
> > > dummy_packet_stream structures from the original buffer to a
> > > newly allocated one, the struct ovs_list txq member of the structure
> > > becomes corrupt (e.g. if ovs_list_is_empty() would have returned
> > > false before the copy, it will return true after the copy, which
> > > will lead to a crash when the bogus packet buffer on the list is
> > > dereferenced).
> > >
> > > Fix by taking a hint from David Wheeler and adding a level of
> > > indirection.
> > >
> > > Signed-off-by: Lance Richardson <lrichard@redhat.com>
> 
> [snip]
> 
> >
> > Here is a small script that reliably reproduces the crash in
> ovs-vswitchd.
> > I don't have an explanation for why we have two connections to the same
> > port in the "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case (this
> > happens infrequently), perhaps it's something like the active connect
> from
> > a peer ovs-vswitchd being interrupted and re-tried.
> >
> > #!/bin/bash
> >
> > set -ex
> >
> > PWD=$(pwd)
> > export PATH=${PWD}/vswitchd:${PWD}/utilities:${PWD}/ovsdb:$PATH
> >
> > ovs_setenv() {
> >     export OVS_RUNDIR=${PWD}/$1
> >     export OVS_LOGDIR=${PWD}/$1
> >     export OVS_DBDIR=${PWD}/$1
> >     export OVS_SYSCONFDIR=${PWD}/$1
> >     export OVS_PKGDATADIR=${PWD}/$1
> > }
> >
> > for x in repro_main repro_hv1 repro_hv2; do
> >     mkdir -p $x
> >     rm -f $x/*
> >     ovs_setenv $x
> >     : > $OVS_RUNDIR/.conf.db.~lock~
> >     ovsdb-tool create $OVS_RUNDIR/conf.db vswitchd/vswitch.ovsschema
> >     ovsdb-server --remote=punix:$OVS_RUNDIR/db.sock  -vconsole:off
> > --detach --no-chdir --pidfile --log-file
> >     ovs-vsctl --no-wait -- init
> >     ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -
> > vunixctl -vconsole:off --detach --no-chdir --pidfile --log-file
> > done
> >
> > ovs_setenv repro_main
> > ovs-vsctl add-br foo  \
> >           -- add-port foo p1 \
> >           -- set Interface p1
> options:pstream="punix:$PWD/repro_main/p1.sock"
> >
> > ovs_setenv repro_hv1
> > ovs-vsctl add-br br1 \
> >           -- add-port br1 p1 \
> >           -- set Interface p1
> options:stream="unix:$PWD/repro_main/p1.sock"
> >
> > ovs_setenv repro_hv2
> > ovs-vsctl add-br br1 \
> >           -- add-port br1 p1 \
> >           -- set Interface p1
> options:stream="unix:$PWD/repro_main/p1.sock"
> 
> I like this, but I think I'm going to be consistent and ask if you
> can spin a new version with the above script as a unit test so that
> we can see the crash before applying the rest of the patch and then
> verify that it doesn't happen with the patch.
> 
> Ryan
> 

Hi Ryan,

Thanks for the review.  I'm a little unclear on whether it's a good idea
to add a test case that doesn't verify any functionality and is no longer
useful as soon as it is merged.  I was rather hoping that the script provided
above would serve the "see the crash before applying" function.

I'm fine with writing and posting such a test case if that's the policy,
but I do wonder how many new test cases we'll have if one is written for
every crash that has been fixed, and how effective this will be at detecting
future crash causes given the enormous number of ways code can be changed
to produce a crash.

On the other hand it probably would be a good idea to add test cases to
verify netdev-dummy functionality (especially for things like multiple
passive connection support that aren't already used in other test cases).
This is probably a larger effort though.

Thanks,

   Lance
Ryan Moats July 19, 2016, 2:20 p.m. UTC | #4
Lance Richardson <lrichard@redhat.com> wrote on 07/19/2016 09:07:41 AM:

> From: Lance Richardson <lrichard@redhat.com>
> To: Ryan Moats/Omaha/IBM@IBMUS
> Cc: dev@openvswitch.org
> Date: 07/19/2016 09:07 AM
> Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> than one passive connection
>
> ----- Original Message -----
> > From: "Ryan Moats" <rmoats@us.ibm.com>
> > To: "Lance Richardson" <lrichard@redhat.com>
> > Cc: dev@openvswitch.org
> > Sent: Monday, July 18, 2016 11:36:27 PM
> > Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> than   one   passive connection
> >
> > "dev" <dev-bounces@openvswitch.org> wrote on 07/18/2016 11:29:28 AM:
> >
> > >
> > > > From: "Lance Richardson" <lrichard@redhat.com>
> > > > To: dev@openvswitch.org
> > > > Sent: Wednesday, July 6, 2016 7:39:52 PM
> > > > Subject: [ovs-dev] [PATCH] netdev-dummy: fix crash with more than
> > > one   passive connection
> > > >
> > > > Investigation found that Some of the occasional failures in the
> > > > "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case are caused
> > > > by ovs-vswitchd crashing with SIGSEGV. It turns out that the
> > > > crash occurrs when the number of netdev-dummy passive connections
> > > > transitions from 1 to 2.  When xrealloc() copies the array of
> > > > dummy_packet_stream structures from the original buffer to a
> > > > newly allocated one, the struct ovs_list txq member of the
structure
> > > > becomes corrupt (e.g. if ovs_list_is_empty() would have returned
> > > > false before the copy, it will return true after the copy, which
> > > > will lead to a crash when the bogus packet buffer on the list is
> > > > dereferenced).
> > > >
> > > > Fix by taking a hint from David Wheeler and adding a level of
> > > > indirection.
> > > >
> > > > Signed-off-by: Lance Richardson <lrichard@redhat.com>
> >
> > [snip]
> >
> > >
> > > Here is a small script that reliably reproduces the crash in
> > ovs-vswitchd.
> > > I don't have an explanation for why we have two connections to the
same
> > > port in the "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case
(this
> > > happens infrequently), perhaps it's something like the active connect
> > from
> > > a peer ovs-vswitchd being interrupted and re-tried.
> > >
> > > #!/bin/bash
> > >
> > > set -ex
> > >
> > > PWD=$(pwd)
> > > export PATH=${PWD}/vswitchd:${PWD}/utilities:${PWD}/ovsdb:$PATH
> > >
> > > ovs_setenv() {
> > >     export OVS_RUNDIR=${PWD}/$1
> > >     export OVS_LOGDIR=${PWD}/$1
> > >     export OVS_DBDIR=${PWD}/$1
> > >     export OVS_SYSCONFDIR=${PWD}/$1
> > >     export OVS_PKGDATADIR=${PWD}/$1
> > > }
> > >
> > > for x in repro_main repro_hv1 repro_hv2; do
> > >     mkdir -p $x
> > >     rm -f $x/*
> > >     ovs_setenv $x
> > >     : > $OVS_RUNDIR/.conf.db.~lock~
> > >     ovsdb-tool create $OVS_RUNDIR/conf.db vswitchd/vswitch.ovsschema
> > >     ovsdb-server --remote=punix:$OVS_RUNDIR/db.sock  -vconsole:off
> > > --detach --no-chdir --pidfile --log-file
> > >     ovs-vsctl --no-wait -- init
> > >     ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -
> > > vunixctl -vconsole:off --detach --no-chdir --pidfile --log-file
> > > done
> > >
> > > ovs_setenv repro_main
> > > ovs-vsctl add-br foo  \
> > >           -- add-port foo p1 \
> > >           -- set Interface p1
> > options:pstream="punix:$PWD/repro_main/p1.sock"
> > >
> > > ovs_setenv repro_hv1
> > > ovs-vsctl add-br br1 \
> > >           -- add-port br1 p1 \
> > >           -- set Interface p1
> > options:stream="unix:$PWD/repro_main/p1.sock"
> > >
> > > ovs_setenv repro_hv2
> > > ovs-vsctl add-br br1 \
> > >           -- add-port br1 p1 \
> > >           -- set Interface p1
> > options:stream="unix:$PWD/repro_main/p1.sock"
> >
> > I like this, but I think I'm going to be consistent and ask if you
> > can spin a new version with the above script as a unit test so that
> > we can see the crash before applying the rest of the patch and then
> > verify that it doesn't happen with the patch.
> >
> > Ryan
> >
>
> Hi Ryan,
>
> Thanks for the review.  I'm a little unclear on whether it's a good idea
> to add a test case that doesn't verify any functionality and is no longer
> useful as soon as it is merged.  I was rather hoping that the script
provided
> above would serve the "see the crash before applying" function.
>
> I'm fine with writing and posting such a test case if that's the policy,
> but I do wonder how many new test cases we'll have if one is written for
> every crash that has been fixed, and how effective this will be at
detecting
> future crash causes given the enormous number of ways code can be changed
> to produce a crash.
>
> On the other hand it probably would be a good idea to add test cases to
> verify netdev-dummy functionality (especially for things like multiple
> passive connection support that aren't already used in other test cases).
> This is probably a larger effort though.

My reasoning (and it is purely personal, I'm not a committer or anything :)
was that since we *know* there is a crash lurking there, a canary test
would be useful to make sure we don't regress and allow it to happen
again...

If others think that I'm being overly-paranoid, then I'll happily drop the
request.

Ryan
Lance Richardson July 19, 2016, 6:09 p.m. UTC | #5
----- Original Message -----
> From: "Ryan Moats" <rmoats@us.ibm.com>
> To: "Lance Richardson" <lrichard@redhat.com>
> Cc: dev@openvswitch.org
> Sent: Tuesday, July 19, 2016 10:20:37 AM
> Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more than	one	passive connection
> 
> Lance Richardson <lrichard@redhat.com> wrote on 07/19/2016 09:07:41 AM:
> 
> > From: Lance Richardson <lrichard@redhat.com>
> > To: Ryan Moats/Omaha/IBM@IBMUS
> > Cc: dev@openvswitch.org
> > Date: 07/19/2016 09:07 AM
> > Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> > than one passive connection
> >
> > ----- Original Message -----
> > > From: "Ryan Moats" <rmoats@us.ibm.com>
> > > To: "Lance Richardson" <lrichard@redhat.com>
> > > Cc: dev@openvswitch.org
> > > Sent: Monday, July 18, 2016 11:36:27 PM
> > > Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> > than   one   passive connection
> > >
> > > "dev" <dev-bounces@openvswitch.org> wrote on 07/18/2016 11:29:28 AM:
> > >
> > > >
> > > > > From: "Lance Richardson" <lrichard@redhat.com>
> > > > > To: dev@openvswitch.org
> > > > > Sent: Wednesday, July 6, 2016 7:39:52 PM
> > > > > Subject: [ovs-dev] [PATCH] netdev-dummy: fix crash with more than
> > > > one   passive connection
> > > > >
> > > > > Investigation found that Some of the occasional failures in the
> > > > > "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case are caused
> > > > > by ovs-vswitchd crashing with SIGSEGV. It turns out that the
> > > > > crash occurrs when the number of netdev-dummy passive connections
> > > > > transitions from 1 to 2.  When xrealloc() copies the array of
> > > > > dummy_packet_stream structures from the original buffer to a
> > > > > newly allocated one, the struct ovs_list txq member of the
> structure
> > > > > becomes corrupt (e.g. if ovs_list_is_empty() would have returned
> > > > > false before the copy, it will return true after the copy, which
> > > > > will lead to a crash when the bogus packet buffer on the list is
> > > > > dereferenced).
> > > > >
> > > > > Fix by taking a hint from David Wheeler and adding a level of
> > > > > indirection.
> > > > >
> > > > > Signed-off-by: Lance Richardson <lrichard@redhat.com>
> > >
> > > [snip]
> > >
> > > >
> > > > Here is a small script that reliably reproduces the crash in
> > > ovs-vswitchd.
> > > > I don't have an explanation for why we have two connections to the
> same
> > > > port in the "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case
> (this
> > > > happens infrequently), perhaps it's something like the active connect
> > > from
> > > > a peer ovs-vswitchd being interrupted and re-tried.
> > > >
> > > > #!/bin/bash
> > > >
> > > > set -ex
> > > >
> > > > PWD=$(pwd)
> > > > export PATH=${PWD}/vswitchd:${PWD}/utilities:${PWD}/ovsdb:$PATH
> > > >
> > > > ovs_setenv() {
> > > >     export OVS_RUNDIR=${PWD}/$1
> > > >     export OVS_LOGDIR=${PWD}/$1
> > > >     export OVS_DBDIR=${PWD}/$1
> > > >     export OVS_SYSCONFDIR=${PWD}/$1
> > > >     export OVS_PKGDATADIR=${PWD}/$1
> > > > }
> > > >
> > > > for x in repro_main repro_hv1 repro_hv2; do
> > > >     mkdir -p $x
> > > >     rm -f $x/*
> > > >     ovs_setenv $x
> > > >     : > $OVS_RUNDIR/.conf.db.~lock~
> > > >     ovsdb-tool create $OVS_RUNDIR/conf.db vswitchd/vswitch.ovsschema
> > > >     ovsdb-server --remote=punix:$OVS_RUNDIR/db.sock  -vconsole:off
> > > > --detach --no-chdir --pidfile --log-file
> > > >     ovs-vsctl --no-wait -- init
> > > >     ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -
> > > > vunixctl -vconsole:off --detach --no-chdir --pidfile --log-file
> > > > done
> > > >
> > > > ovs_setenv repro_main
> > > > ovs-vsctl add-br foo  \
> > > >           -- add-port foo p1 \
> > > >           -- set Interface p1
> > > options:pstream="punix:$PWD/repro_main/p1.sock"
> > > >
> > > > ovs_setenv repro_hv1
> > > > ovs-vsctl add-br br1 \
> > > >           -- add-port br1 p1 \
> > > >           -- set Interface p1
> > > options:stream="unix:$PWD/repro_main/p1.sock"
> > > >
> > > > ovs_setenv repro_hv2
> > > > ovs-vsctl add-br br1 \
> > > >           -- add-port br1 p1 \
> > > >           -- set Interface p1
> > > options:stream="unix:$PWD/repro_main/p1.sock"
> > >
> > > I like this, but I think I'm going to be consistent and ask if you
> > > can spin a new version with the above script as a unit test so that
> > > we can see the crash before applying the rest of the patch and then
> > > verify that it doesn't happen with the patch.
> > >
> > > Ryan
> > >
> >
> > Hi Ryan,
> >
> > Thanks for the review.  I'm a little unclear on whether it's a good idea
> > to add a test case that doesn't verify any functionality and is no longer
> > useful as soon as it is merged.  I was rather hoping that the script
> provided
> > above would serve the "see the crash before applying" function.
> >
> > I'm fine with writing and posting such a test case if that's the policy,
> > but I do wonder how many new test cases we'll have if one is written for
> > every crash that has been fixed, and how effective this will be at
> detecting
> > future crash causes given the enormous number of ways code can be changed
> > to produce a crash.
> >
> > On the other hand it probably would be a good idea to add test cases to
> > verify netdev-dummy functionality (especially for things like multiple
> > passive connection support that aren't already used in other test cases).
> > This is probably a larger effort though.
> 
> My reasoning (and it is purely personal, I'm not a committer or anything :)
> was that since we *know* there is a crash lurking there, a canary test
> would be useful to make sure we don't regress and allow it to happen
> again...
> 
> If others think that I'm being overly-paranoid, then I'll happily drop the
> request.
> 
> Ryan
> 
OK, thanks Ryan... will wait for input from other reviewers.
Ben Pfaff July 22, 2016, 10:18 p.m. UTC | #6
On Tue, Jul 19, 2016 at 09:20:37AM -0500, Ryan Moats wrote:
> Lance Richardson <lrichard@redhat.com> wrote on 07/19/2016 09:07:41 AM:
> 
> > From: Lance Richardson <lrichard@redhat.com>
> > To: Ryan Moats/Omaha/IBM@IBMUS
> > Cc: dev@openvswitch.org
> > Date: 07/19/2016 09:07 AM
> > Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> > than one passive connection
> >
> > ----- Original Message -----
> > > From: "Ryan Moats" <rmoats@us.ibm.com>
> > > To: "Lance Richardson" <lrichard@redhat.com>
> > > Cc: dev@openvswitch.org
> > > Sent: Monday, July 18, 2016 11:36:27 PM
> > > Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> > than   one   passive connection
> > >
> > > "dev" <dev-bounces@openvswitch.org> wrote on 07/18/2016 11:29:28 AM:
> > >
> > > >
> > > > > From: "Lance Richardson" <lrichard@redhat.com>
> > > > > To: dev@openvswitch.org
> > > > > Sent: Wednesday, July 6, 2016 7:39:52 PM
> > > > > Subject: [ovs-dev] [PATCH] netdev-dummy: fix crash with more than
> > > > one   passive connection
> > > > >
> > > > > Investigation found that Some of the occasional failures in the
> > > > > "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case are caused
> > > > > by ovs-vswitchd crashing with SIGSEGV. It turns out that the
> > > > > crash occurrs when the number of netdev-dummy passive connections
> > > > > transitions from 1 to 2.  When xrealloc() copies the array of
> > > > > dummy_packet_stream structures from the original buffer to a
> > > > > newly allocated one, the struct ovs_list txq member of the
> structure
> > > > > becomes corrupt (e.g. if ovs_list_is_empty() would have returned
> > > > > false before the copy, it will return true after the copy, which
> > > > > will lead to a crash when the bogus packet buffer on the list is
> > > > > dereferenced).
> > > > >
> > > > > Fix by taking a hint from David Wheeler and adding a level of
> > > > > indirection.
> > > > >
> > > > > Signed-off-by: Lance Richardson <lrichard@redhat.com>
> > >
> > > [snip]
> > >
> > > >
> > > > Here is a small script that reliably reproduces the crash in
> > > ovs-vswitchd.
> > > > I don't have an explanation for why we have two connections to the
> same
> > > > port in the "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case
> (this
> > > > happens infrequently), perhaps it's something like the active connect
> > > from
> > > > a peer ovs-vswitchd being interrupted and re-tried.
> > > >
> > > > #!/bin/bash
> > > >
> > > > set -ex
> > > >
> > > > PWD=$(pwd)
> > > > export PATH=${PWD}/vswitchd:${PWD}/utilities:${PWD}/ovsdb:$PATH
> > > >
> > > > ovs_setenv() {
> > > >     export OVS_RUNDIR=${PWD}/$1
> > > >     export OVS_LOGDIR=${PWD}/$1
> > > >     export OVS_DBDIR=${PWD}/$1
> > > >     export OVS_SYSCONFDIR=${PWD}/$1
> > > >     export OVS_PKGDATADIR=${PWD}/$1
> > > > }
> > > >
> > > > for x in repro_main repro_hv1 repro_hv2; do
> > > >     mkdir -p $x
> > > >     rm -f $x/*
> > > >     ovs_setenv $x
> > > >     : > $OVS_RUNDIR/.conf.db.~lock~
> > > >     ovsdb-tool create $OVS_RUNDIR/conf.db vswitchd/vswitch.ovsschema
> > > >     ovsdb-server --remote=punix:$OVS_RUNDIR/db.sock  -vconsole:off
> > > > --detach --no-chdir --pidfile --log-file
> > > >     ovs-vsctl --no-wait -- init
> > > >     ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -
> > > > vunixctl -vconsole:off --detach --no-chdir --pidfile --log-file
> > > > done
> > > >
> > > > ovs_setenv repro_main
> > > > ovs-vsctl add-br foo  \
> > > >           -- add-port foo p1 \
> > > >           -- set Interface p1
> > > options:pstream="punix:$PWD/repro_main/p1.sock"
> > > >
> > > > ovs_setenv repro_hv1
> > > > ovs-vsctl add-br br1 \
> > > >           -- add-port br1 p1 \
> > > >           -- set Interface p1
> > > options:stream="unix:$PWD/repro_main/p1.sock"
> > > >
> > > > ovs_setenv repro_hv2
> > > > ovs-vsctl add-br br1 \
> > > >           -- add-port br1 p1 \
> > > >           -- set Interface p1
> > > options:stream="unix:$PWD/repro_main/p1.sock"
> > >
> > > I like this, but I think I'm going to be consistent and ask if you
> > > can spin a new version with the above script as a unit test so that
> > > we can see the crash before applying the rest of the patch and then
> > > verify that it doesn't happen with the patch.
> > >
> > > Ryan
> > >
> >
> > Hi Ryan,
> >
> > Thanks for the review.  I'm a little unclear on whether it's a good idea
> > to add a test case that doesn't verify any functionality and is no longer
> > useful as soon as it is merged.  I was rather hoping that the script
> provided
> > above would serve the "see the crash before applying" function.
> >
> > I'm fine with writing and posting such a test case if that's the policy,
> > but I do wonder how many new test cases we'll have if one is written for
> > every crash that has been fixed, and how effective this will be at
> detecting
> > future crash causes given the enormous number of ways code can be changed
> > to produce a crash.
> >
> > On the other hand it probably would be a good idea to add test cases to
> > verify netdev-dummy functionality (especially for things like multiple
> > passive connection support that aren't already used in other test cases).
> > This is probably a larger effort though.
> 
> My reasoning (and it is purely personal, I'm not a committer or anything :)
> was that since we *know* there is a crash lurking there, a canary test
> would be useful to make sure we don't regress and allow it to happen
> again...
> 
> If others think that I'm being overly-paranoid, then I'll happily drop the
> request.

I usually agree.

In this case, though, this is a feature that is intended only for
developer testing, primarily within OVS's own tests, and which in fact
cannot be used by end users without making a special effort to enable it
(by adding --enable-dummy to the OVS command line, which distribution
packaging doesn't do).  It's OK if the OVS tests break, we'll fix them.
Ryan Moats July 22, 2016, 10:25 p.m. UTC | #7
Ben Pfaff <blp@ovn.org> wrote on 07/22/2016 05:18:44 PM:

> From: Ben Pfaff <blp@ovn.org>
> To: Ryan Moats/Omaha/IBM@IBMUS
> Cc: Lance Richardson <lrichard@redhat.com>, dev@openvswitch.org
> Date: 07/22/2016 05:18 PM
> Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> than one passive connection
>
> On Tue, Jul 19, 2016 at 09:20:37AM -0500, Ryan Moats wrote:
> > Lance Richardson <lrichard@redhat.com> wrote on 07/19/2016 09:07:41 AM:
> >
> > > From: Lance Richardson <lrichard@redhat.com>
> > > To: Ryan Moats/Omaha/IBM@IBMUS
> > > Cc: dev@openvswitch.org
> > > Date: 07/19/2016 09:07 AM
> > > Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> > > than one passive connection
> > >
> > > ----- Original Message -----
> > > > From: "Ryan Moats" <rmoats@us.ibm.com>
> > > > To: "Lance Richardson" <lrichard@redhat.com>
> > > > Cc: dev@openvswitch.org
> > > > Sent: Monday, July 18, 2016 11:36:27 PM
> > > > Subject: Re: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
> > > than   one   passive connection
> > > >
> > > > "dev" <dev-bounces@openvswitch.org> wrote on 07/18/2016 11:29:28
AM:
> > > >
> > > > >
> > > > > > From: "Lance Richardson" <lrichard@redhat.com>
> > > > > > To: dev@openvswitch.org
> > > > > > Sent: Wednesday, July 6, 2016 7:39:52 PM
> > > > > > Subject: [ovs-dev] [PATCH] netdev-dummy: fix crash with more
than
> > > > > one   passive connection
> > > > > >
> > > > > > Investigation found that Some of the occasional failures in the
> > > > > > "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case are
caused
> > > > > > by ovs-vswitchd crashing with SIGSEGV. It turns out that the
> > > > > > crash occurrs when the number of netdev-dummy passive
connections
> > > > > > transitions from 1 to 2.  When xrealloc() copies the array of
> > > > > > dummy_packet_stream structures from the original buffer to a
> > > > > > newly allocated one, the struct ovs_list txq member of the
> > structure
> > > > > > becomes corrupt (e.g. if ovs_list_is_empty() would have
returned
> > > > > > false before the copy, it will return true after the copy,
which
> > > > > > will lead to a crash when the bogus packet buffer on the list
is
> > > > > > dereferenced).
> > > > > >
> > > > > > Fix by taking a hint from David Wheeler and adding a level of
> > > > > > indirection.
> > > > > >
> > > > > > Signed-off-by: Lance Richardson <lrichard@redhat.com>
> > > >
> > > > [snip]
> > > >
> > > > >
> > > > > Here is a small script that reliably reproduces the crash in
> > > > ovs-vswitchd.
> > > > > I don't have an explanation for why we have two connections to
the
> > same
> > > > > port in the "ovn -- vtep: 3 HVs, 1 VIFs/HV, 1 GW, 1 LS" test case
> > (this
> > > > > happens infrequently), perhaps it's something like the active
connect
> > > > from
> > > > > a peer ovs-vswitchd being interrupted and re-tried.
> > > > >
> > > > > #!/bin/bash
> > > > >
> > > > > set -ex
> > > > >
> > > > > PWD=$(pwd)
> > > > > export PATH=${PWD}/vswitchd:${PWD}/utilities:${PWD}/ovsdb:$PATH
> > > > >
> > > > > ovs_setenv() {
> > > > >     export OVS_RUNDIR=${PWD}/$1
> > > > >     export OVS_LOGDIR=${PWD}/$1
> > > > >     export OVS_DBDIR=${PWD}/$1
> > > > >     export OVS_SYSCONFDIR=${PWD}/$1
> > > > >     export OVS_PKGDATADIR=${PWD}/$1
> > > > > }
> > > > >
> > > > > for x in repro_main repro_hv1 repro_hv2; do
> > > > >     mkdir -p $x
> > > > >     rm -f $x/*
> > > > >     ovs_setenv $x
> > > > >     : > $OVS_RUNDIR/.conf.db.~lock~
> > > > >     ovsdb-tool create $OVS_RUNDIR/conf.db
vswitchd/vswitch.ovsschema
> > > > >     ovsdb-server --remote=punix:$OVS_RUNDIR/db.sock
-vconsole:off
> > > > > --detach --no-chdir --pidfile --log-file
> > > > >     ovs-vsctl --no-wait -- init
> > > > >     ovs-vswitchd --enable-dummy=system -vvconn -vofproto_dpif -
> > > > > vunixctl -vconsole:off --detach --no-chdir --pidfile --log-file
> > > > > done
> > > > >
> > > > > ovs_setenv repro_main
> > > > > ovs-vsctl add-br foo  \
> > > > >           -- add-port foo p1 \
> > > > >           -- set Interface p1
> > > > options:pstream="punix:$PWD/repro_main/p1.sock"
> > > > >
> > > > > ovs_setenv repro_hv1
> > > > > ovs-vsctl add-br br1 \
> > > > >           -- add-port br1 p1 \
> > > > >           -- set Interface p1
> > > > options:stream="unix:$PWD/repro_main/p1.sock"
> > > > >
> > > > > ovs_setenv repro_hv2
> > > > > ovs-vsctl add-br br1 \
> > > > >           -- add-port br1 p1 \
> > > > >           -- set Interface p1
> > > > options:stream="unix:$PWD/repro_main/p1.sock"
> > > >
> > > > I like this, but I think I'm going to be consistent and ask if you
> > > > can spin a new version with the above script as a unit test so that
> > > > we can see the crash before applying the rest of the patch and then
> > > > verify that it doesn't happen with the patch.
> > > >
> > > > Ryan
> > > >
> > >
> > > Hi Ryan,
> > >
> > > Thanks for the review.  I'm a little unclear on whether it's a good
idea
> > > to add a test case that doesn't verify any functionality and is no
longer
> > > useful as soon as it is merged.  I was rather hoping that the script
> > provided
> > > above would serve the "see the crash before applying" function.
> > >
> > > I'm fine with writing and posting such a test case if that's the
policy,
> > > but I do wonder how many new test cases we'll have if one is written
for
> > > every crash that has been fixed, and how effective this will be at
> > detecting
> > > future crash causes given the enormous number of ways code can be
changed
> > > to produce a crash.
> > >
> > > On the other hand it probably would be a good idea to add test cases
to
> > > verify netdev-dummy functionality (especially for things like
multiple
> > > passive connection support that aren't already used in other test
cases).
> > > This is probably a larger effort though.
> >
> > My reasoning (and it is purely personal, I'm not a committer or
anything :)
> > was that since we *know* there is a crash lurking there, a canary test
> > would be useful to make sure we don't regress and allow it to happen
> > again...
> >
> > If others think that I'm being overly-paranoid, then I'll happily drop
the
> > request.
>
> I usually agree.
>
> In this case, though, this is a feature that is intended only for
> developer testing, primarily within OVS's own tests, and which in fact
> cannot be used by end users without making a special effort to enable it
> (by adding --enable-dummy to the OVS command line, which distribution
> packaging doesn't do).  It's OK if the OVS tests break, we'll fix them.

Ok, then I withdraw the request - I'd put an Ack on it, but since it
has already landed ....

Ryan
diff mbox

Patch

diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
index 24c107e..c99db2b 100644
--- a/lib/netdev-dummy.c
+++ b/lib/netdev-dummy.c
@@ -68,7 +68,7 @@  enum dummy_netdev_conn_state {
 
 struct dummy_packet_pconn {
     struct pstream *pstream;
-    struct dummy_packet_stream *streams;
+    struct dummy_packet_stream **streams;
     size_t n_streams;
 };
 
@@ -328,7 +328,8 @@  dummy_packet_conn_close(struct dummy_packet_conn *conn)
     case PASSIVE:
         pstream_close(pconn->pstream);
         for (i = 0; i < pconn->n_streams; i++) {
-            dummy_packet_stream_close(&pconn->streams[i]);
+            dummy_packet_stream_close(pconn->streams[i]);
+            free(pconn->streams[i]);
         }
         free(pconn->streams);
         pconn->pstream = NULL;
@@ -446,8 +447,9 @@  dummy_pconn_run(struct netdev_dummy *dev)
 
         pconn->streams = xrealloc(pconn->streams,
                                 ((pconn->n_streams + 1)
-                                 * sizeof *s));
-        s = &pconn->streams[pconn->n_streams++];
+                                 * sizeof s));
+        s = xmalloc(sizeof *s);
+        pconn->streams[pconn->n_streams++] = s;
         dummy_packet_stream_init(s, new_stream);
     } else if (error != EAGAIN) {
         VLOG_WARN("%s: accept failed (%s)",
@@ -458,7 +460,7 @@  dummy_pconn_run(struct netdev_dummy *dev)
     }
 
     for (i = 0; i < pconn->n_streams; i++) {
-        struct dummy_packet_stream *s = &pconn->streams[i];
+        struct dummy_packet_stream *s = pconn->streams[i];
 
         error = dummy_packet_stream_run(dev, s);
         if (error) {
@@ -466,6 +468,7 @@  dummy_pconn_run(struct netdev_dummy *dev)
                      stream_get_name(s->stream),
                      ovs_retval_to_string(error));
             dummy_packet_stream_close(s);
+            free(s);
             pconn->streams[i] = pconn->streams[--pconn->n_streams];
         }
     }
@@ -553,7 +556,7 @@  dummy_packet_conn_wait(struct dummy_packet_conn *conn)
     case PASSIVE:
         pstream_wait(conn->u.pconn.pstream);
         for (i = 0; i < conn->u.pconn.n_streams; i++) {
-            struct dummy_packet_stream *s = &conn->u.pconn.streams[i];
+            struct dummy_packet_stream *s = conn->u.pconn.streams[i];
             dummy_packet_stream_wait(s);
         }
         break;
@@ -578,7 +581,7 @@  dummy_packet_conn_send(struct dummy_packet_conn *conn,
     switch (conn->type) {
     case PASSIVE:
         for (i = 0; i < conn->u.pconn.n_streams; i++) {
-            struct dummy_packet_stream *s = &conn->u.pconn.streams[i];
+            struct dummy_packet_stream *s = conn->u.pconn.streams[i];
 
             dummy_packet_stream_send(s, buffer, size);
             pstream_wait(conn->u.pconn.pstream);