diff mbox series

[ovs-dev] ofproto: fix use-after-free for "ofproto".

Message ID 1638530715-44436-1-git-send-email-wangyunjian@huawei.com
State Changes Requested
Headers show
Series [ovs-dev] ofproto: fix use-after-free for "ofproto". | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

wangyunjian Dec. 3, 2021, 11:25 a.m. UTC
When handler threads lookup a "ofproto" and use it, main thread maybe
remove and free the "ofproto" at the same time. The "ofproto" has not
been protected well, which can lead to an OVS crash.

This patch fixes this by making the "ofproto" lookup RCU-safe by using
cmap instead of hmap and moving remove "ofproto" call before
xlate_txn_commit().

(gdb) bt
  hmap_next (hmap.h:398)
  seq_wake_waiters (seq.c:326)
  seq_change_protected (seq.c:134)
  seq_change (seq.c:144)
  ofproto_dpif_send_async_msg (ofproto_dpif.c:263)
  process_upcall (ofproto_dpif_upcall.c:1782)
  recv_upcalls (ofproto_dpif_upcall.c:1026)
  udpif_upcall_handler (ofproto/ofproto_dpif_upcall.c:945)
  ovsthread_wrapper (ovs_thread.c:734)

Cc: Justin Pettit <jpettit@ovn.org>
Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to user action cookie.")
Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
---
 ofproto/ofproto-dpif.c | 12 ++++++------
 ofproto/ofproto-dpif.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

Comments

wangyunjian Jan. 8, 2022, 9:18 a.m. UTC | #1
Friendly ping.

> -----Original Message-----
> From: wangyunjian
> Sent: Friday, December 3, 2021 7:25 PM
> To: dev@openvswitch.org; i.maximets@ovn.org
> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>; xudingke
> <xudingke@huawei.com>; wangyunjian <wangyunjian@huawei.com>; Justin
> Pettit <jpettit@ovn.org>
> Subject: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> 
> When handler threads lookup a "ofproto" and use it, main thread maybe
> remove and free the "ofproto" at the same time. The "ofproto" has not
> been protected well, which can lead to an OVS crash.
> 
> This patch fixes this by making the "ofproto" lookup RCU-safe by using
> cmap instead of hmap and moving remove "ofproto" call before
> xlate_txn_commit().
> 
> (gdb) bt
>   hmap_next (hmap.h:398)
>   seq_wake_waiters (seq.c:326)
>   seq_change_protected (seq.c:134)
>   seq_change (seq.c:144)
>   ofproto_dpif_send_async_msg (ofproto_dpif.c:263)
>   process_upcall (ofproto_dpif_upcall.c:1782)
>   recv_upcalls (ofproto_dpif_upcall.c:1026)
>   udpif_upcall_handler (ofproto/ofproto_dpif_upcall.c:945)
>   ovsthread_wrapper (ovs_thread.c:734)
> 
> Cc: Justin Pettit <jpettit@ovn.org>
> Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to user
> action cookie.")
> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> ---
>  ofproto/ofproto-dpif.c | 12 ++++++------
>  ofproto/ofproto-dpif.h |  2 +-
>  2 files changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> index cba49a99e..aa8d32f81 100644
> --- a/ofproto/ofproto-dpif.c
> +++ b/ofproto/ofproto-dpif.c
> @@ -216,8 +216,7 @@ static struct hmap all_ofproto_dpifs_by_name =
> 
> HMAP_INITIALIZER(&all_ofproto_dpifs_by_name);
> 
>  /* All existing ofproto_dpif instances, indexed by ->uuid. */
> -static struct hmap all_ofproto_dpifs_by_uuid =
> -
> HMAP_INITIALIZER(&all_ofproto_dpifs_by_uuid);
> +static struct cmap all_ofproto_dpifs_by_uuid = CMAP_INITIALIZER;
> 
>  static bool ofproto_use_tnl_push_pop = true;
>  static void ofproto_unixctl_init(void);
> @@ -1682,7 +1681,7 @@ construct(struct ofproto *ofproto_)
>      hmap_insert(&all_ofproto_dpifs_by_name,
>                  &ofproto->all_ofproto_dpifs_by_name_node,
>                  hash_string(ofproto->up.name, 0));
> -    hmap_insert(&all_ofproto_dpifs_by_uuid,
> +    cmap_insert(&all_ofproto_dpifs_by_uuid,
>                  &ofproto->all_ofproto_dpifs_by_uuid_node,
>                  uuid_hash(&ofproto->uuid));
>      memset(&ofproto->stats, 0, sizeof ofproto->stats);
> @@ -1778,12 +1777,13 @@ destruct(struct ofproto *ofproto_, bool del)
>      ofproto->backer->need_revalidate = REV_RECONFIGURE;
>      xlate_txn_start();
>      xlate_remove_ofproto(ofproto);
> +    cmap_remove(&all_ofproto_dpifs_by_uuid,
> +                &ofproto->all_ofproto_dpifs_by_uuid_node,
> +                uuid_hash(&ofproto->uuid));
>      xlate_txn_commit();
> 
>      hmap_remove(&all_ofproto_dpifs_by_name,
>                  &ofproto->all_ofproto_dpifs_by_name_node);
> -    hmap_remove(&all_ofproto_dpifs_by_uuid,
> -                &ofproto->all_ofproto_dpifs_by_uuid_node);
> 
>      OFPROTO_FOR_EACH_TABLE (table, &ofproto->up) {
>          CLS_FOR_EACH (rule, up.cr, &table->cls) {
> @@ -5781,7 +5781,7 @@ ofproto_dpif_lookup_by_uuid(const struct uuid
> *uuid)
>  {
>      struct ofproto_dpif *ofproto;
> 
> -    HMAP_FOR_EACH_WITH_HASH (ofproto,
> all_ofproto_dpifs_by_uuid_node,
> +    CMAP_FOR_EACH_WITH_HASH (ofproto,
> all_ofproto_dpifs_by_uuid_node,
>                               uuid_hash(uuid),
> &all_ofproto_dpifs_by_uuid) {
>          if (uuid_equals(&ofproto->uuid, uuid)) {
>              return ofproto;
> diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
> index 191cfcb0d..fba03f2cc 100644
> --- a/ofproto/ofproto-dpif.h
> +++ b/ofproto/ofproto-dpif.h
> @@ -295,7 +295,7 @@ struct ofproto_dpif {
>      struct hmap_node all_ofproto_dpifs_by_name_node;
> 
>      /* In 'all_ofproto_dpifs_by_uuid'. */
> -    struct hmap_node all_ofproto_dpifs_by_uuid_node;
> +    struct cmap_node all_ofproto_dpifs_by_uuid_node;
> 
>      struct ofproto up;
>      struct dpif_backer *backer;
> --
> 2.27.0
Peng He Jan. 18, 2022, 7:22 a.m. UTC | #2
Hi, yunjian and Ilya,

this patch reminds me of the discussion we have in March last year, and
during the discussion, you have spotted this
thread-safety issue of uuid map. Unfortunately, in that email, you did not
reply to the mailist, so I cannot give a link to
the email. I attach it as a reference.

I quote it here:

"
That is an interesting example here...
I can't help but notice that this function is typically called
from different handler or pmd threads and modified by the main thread
while upcalls enabled.  And hmap is not a thread-safe structure.
I guess, we have another possible problem here.  We need to protect
at least this hmap and the other one with rw lock or something...
Am I right or am I missing something?  What else we need to protect?
"

And this patch fixes exactly the problem you stated.



wangyunjian via dev <ovs-dev@openvswitch.org> 于2022年1月8日周六 17:18写道:

> Friendly ping.
>
> > -----Original Message-----
> > From: wangyunjian
> > Sent: Friday, December 3, 2021 7:25 PM
> > To: dev@openvswitch.org; i.maximets@ovn.org
> > Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>; xudingke
> > <xudingke@huawei.com>; wangyunjian <wangyunjian@huawei.com>; Justin
> > Pettit <jpettit@ovn.org>
> > Subject: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> >
> > When handler threads lookup a "ofproto" and use it, main thread maybe
> > remove and free the "ofproto" at the same time. The "ofproto" has not
> > been protected well, which can lead to an OVS crash.
> >
> > This patch fixes this by making the "ofproto" lookup RCU-safe by using
> > cmap instead of hmap and moving remove "ofproto" call before
> > xlate_txn_commit().
> >
> > (gdb) bt
> >   hmap_next (hmap.h:398)
> >   seq_wake_waiters (seq.c:326)
> >   seq_change_protected (seq.c:134)
> >   seq_change (seq.c:144)
> >   ofproto_dpif_send_async_msg (ofproto_dpif.c:263)
> >   process_upcall (ofproto_dpif_upcall.c:1782)
> >   recv_upcalls (ofproto_dpif_upcall.c:1026)
> >   udpif_upcall_handler (ofproto/ofproto_dpif_upcall.c:945)
> >   ovsthread_wrapper (ovs_thread.c:734)
> >
> > Cc: Justin Pettit <jpettit@ovn.org>
> > Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to
> user
> > action cookie.")
> > Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
> > ---
> >  ofproto/ofproto-dpif.c | 12 ++++++------
> >  ofproto/ofproto-dpif.h |  2 +-
> >  2 files changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> > index cba49a99e..aa8d32f81 100644
> > --- a/ofproto/ofproto-dpif.c
> > +++ b/ofproto/ofproto-dpif.c
> > @@ -216,8 +216,7 @@ static struct hmap all_ofproto_dpifs_by_name =
> >
> > HMAP_INITIALIZER(&all_ofproto_dpifs_by_name);
> >
> >  /* All existing ofproto_dpif instances, indexed by ->uuid. */
> > -static struct hmap all_ofproto_dpifs_by_uuid =
> > -
> > HMAP_INITIALIZER(&all_ofproto_dpifs_by_uuid);
> > +static struct cmap all_ofproto_dpifs_by_uuid = CMAP_INITIALIZER;
> >
> >  static bool ofproto_use_tnl_push_pop = true;
> >  static void ofproto_unixctl_init(void);
> > @@ -1682,7 +1681,7 @@ construct(struct ofproto *ofproto_)
> >      hmap_insert(&all_ofproto_dpifs_by_name,
> >                  &ofproto->all_ofproto_dpifs_by_name_node,
> >                  hash_string(ofproto->up.name, 0));
> > -    hmap_insert(&all_ofproto_dpifs_by_uuid,
> > +    cmap_insert(&all_ofproto_dpifs_by_uuid,
> >                  &ofproto->all_ofproto_dpifs_by_uuid_node,
> >                  uuid_hash(&ofproto->uuid));
> >      memset(&ofproto->stats, 0, sizeof ofproto->stats);
> > @@ -1778,12 +1777,13 @@ destruct(struct ofproto *ofproto_, bool del)
> >      ofproto->backer->need_revalidate = REV_RECONFIGURE;
> >      xlate_txn_start();
> >      xlate_remove_ofproto(ofproto);
> > +    cmap_remove(&all_ofproto_dpifs_by_uuid,
> > +                &ofproto->all_ofproto_dpifs_by_uuid_node,
> > +                uuid_hash(&ofproto->uuid));
>

I guess some comments are needed here, you actually make use of
the rcu_synchronize in the xlate_txn_commit to avoid access to the
ofproto from other thread through uuid map.


> >      xlate_txn_commit();
> >
> >      hmap_remove(&all_ofproto_dpifs_by_name,
> >                  &ofproto->all_ofproto_dpifs_by_name_node);
> > -    hmap_remove(&all_ofproto_dpifs_by_uuid,
> > -                &ofproto->all_ofproto_dpifs_by_uuid_node);
> >
> >      OFPROTO_FOR_EACH_TABLE (table, &ofproto->up) {
> >          CLS_FOR_EACH (rule, up.cr, &table->cls) {
> > @@ -5781,7 +5781,7 @@ ofproto_dpif_lookup_by_uuid(const struct uuid
> > *uuid)
> >  {
> >      struct ofproto_dpif *ofproto;
> >
> > -    HMAP_FOR_EACH_WITH_HASH (ofproto,
> > all_ofproto_dpifs_by_uuid_node,
> > +    CMAP_FOR_EACH_WITH_HASH (ofproto,
> > all_ofproto_dpifs_by_uuid_node,
> >                               uuid_hash(uuid),
> > &all_ofproto_dpifs_by_uuid) {
> >          if (uuid_equals(&ofproto->uuid, uuid)) {
> >              return ofproto;
> > diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
> > index 191cfcb0d..fba03f2cc 100644
> > --- a/ofproto/ofproto-dpif.h
> > +++ b/ofproto/ofproto-dpif.h
> > @@ -295,7 +295,7 @@ struct ofproto_dpif {
> >      struct hmap_node all_ofproto_dpifs_by_name_node;
> >
> >      /* In 'all_ofproto_dpifs_by_uuid'. */
> > -    struct hmap_node all_ofproto_dpifs_by_uuid_node;
> > +    struct cmap_node all_ofproto_dpifs_by_uuid_node;
> >
> >      struct ofproto up;
> >      struct dpif_backer *backer;
> > --
> > 2.27.0
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
wangyunjian Jan. 18, 2022, 1:28 p.m. UTC | #3
http://patchwork.ozlabs.org/project/openvswitch/patch/20200306130555.19884-1-guohongzhi1@huawei.com/
This patch has been applied. But this patch do not fix the problem I stated.
I think this problem is that hmap does not support concurrent access.

Thanks,
Yunjian

From: 贺鹏 [mailto:xnhp0320@gmail.com]
Sent: Tuesday, January 18, 2022 3:23 PM
To: wangyunjian <wangyunjian@huawei.com>
Cc: dev@openvswitch.org; i.maximets@ovn.org; dingxiaoxiong <dingxiaoxiong@huawei.com>
Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".

Hi, yunjian and Ilya,

this patch reminds me of the discussion we have in March last year, and during the discussion, you have spotted this
thread-safety issue of uuid map. Unfortunately, in that email, you did not reply to the mailist, so I cannot give a link to
the email. I attach it as a reference.

I quote it here:

"
That is an interesting example here...
I can't help but notice that this function is typically called
from different handler or pmd threads and modified by the main thread
while upcalls enabled.  And hmap is not a thread-safe structure.
I guess, we have another possible problem here.  We need to protect
at least this hmap and the other one with rw lock or something...
Am I right or am I missing something?  What else we need to protect?
"

And this patch fixes exactly the problem you stated.



wangyunjian via dev <ovs-dev@openvswitch.org<mailto:ovs-dev@openvswitch.org>> 于2022年1月8日周六 17:18写道:
Friendly ping.

> -----Original Message-----
> From: wangyunjian
> Sent: Friday, December 3, 2021 7:25 PM
> To: dev@openvswitch.org<mailto:dev@openvswitch.org>; i.maximets@ovn.org<mailto:i.maximets@ovn.org>
> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com<mailto:dingxiaoxiong@huawei.com>>; xudingke
> <xudingke@huawei.com<mailto:xudingke@huawei.com>>; wangyunjian <wangyunjian@huawei.com<mailto:wangyunjian@huawei.com>>; Justin
> Pettit <jpettit@ovn.org<mailto:jpettit@ovn.org>>
> Subject: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>
> When handler threads lookup a "ofproto" and use it, main thread maybe
> remove and free the "ofproto" at the same time. The "ofproto" has not
> been protected well, which can lead to an OVS crash.
>
> This patch fixes this by making the "ofproto" lookup RCU-safe by using
> cmap instead of hmap and moving remove "ofproto" call before
> xlate_txn_commit().
>
> (gdb) bt
>   hmap_next (hmap.h:398)
>   seq_wake_waiters (seq.c:326)
>   seq_change_protected (seq.c:134)
>   seq_change (seq.c:144)
>   ofproto_dpif_send_async_msg (ofproto_dpif.c:263)
>   process_upcall (ofproto_dpif_upcall.c:1782)
>   recv_upcalls (ofproto_dpif_upcall.c:1026)
>   udpif_upcall_handler (ofproto/ofproto_dpif_upcall.c:945)
>   ovsthread_wrapper (ovs_thread.c:734)
>
> Cc: Justin Pettit <jpettit@ovn.org<mailto:jpettit@ovn.org>>
> Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to user
> action cookie.")
> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com<mailto:wangyunjian@huawei.com>>
> ---
>  ofproto/ofproto-dpif.c | 12 ++++++------
>  ofproto/ofproto-dpif.h |  2 +-
>  2 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> index cba49a99e..aa8d32f81 100644
> --- a/ofproto/ofproto-dpif.c
> +++ b/ofproto/ofproto-dpif.c
> @@ -216,8 +216,7 @@ static struct hmap all_ofproto_dpifs_by_name =
>
> HMAP_INITIALIZER(&all_ofproto_dpifs_by_name);
>
>  /* All existing ofproto_dpif instances, indexed by ->uuid. */
> -static struct hmap all_ofproto_dpifs_by_uuid =
> -
> HMAP_INITIALIZER(&all_ofproto_dpifs_by_uuid);
> +static struct cmap all_ofproto_dpifs_by_uuid = CMAP_INITIALIZER;
>
>  static bool ofproto_use_tnl_push_pop = true;
>  static void ofproto_unixctl_init(void);
> @@ -1682,7 +1681,7 @@ construct(struct ofproto *ofproto_)
>      hmap_insert(&all_ofproto_dpifs_by_name,
>                  &ofproto->all_ofproto_dpifs_by_name_node,
>                  hash_string(ofproto->up.name<http://up.name>, 0));
> -    hmap_insert(&all_ofproto_dpifs_by_uuid,
> +    cmap_insert(&all_ofproto_dpifs_by_uuid,
>                  &ofproto->all_ofproto_dpifs_by_uuid_node,
>                  uuid_hash(&ofproto->uuid));
>      memset(&ofproto->stats, 0, sizeof ofproto->stats);
> @@ -1778,12 +1777,13 @@ destruct(struct ofproto *ofproto_, bool del)
>      ofproto->backer->need_revalidate = REV_RECONFIGURE;
>      xlate_txn_start();
>      xlate_remove_ofproto(ofproto);
> +    cmap_remove(&all_ofproto_dpifs_by_uuid,
> +                &ofproto->all_ofproto_dpifs_by_uuid_node,
> +                uuid_hash(&ofproto->uuid));

I guess some comments are needed here, you actually make use of
the rcu_synchronize in the xlate_txn_commit to avoid access to the
ofproto from other thread through uuid map.

>      xlate_txn_commit();
>
>      hmap_remove(&all_ofproto_dpifs_by_name,
>                  &ofproto->all_ofproto_dpifs_by_name_node);
> -    hmap_remove(&all_ofproto_dpifs_by_uuid,
> -                &ofproto->all_ofproto_dpifs_by_uuid_node);
>
>      OFPROTO_FOR_EACH_TABLE (table, &ofproto->up) {
>          CLS_FOR_EACH (rule, up.cr<http://up.cr>, &table->cls) {
> @@ -5781,7 +5781,7 @@ ofproto_dpif_lookup_by_uuid(const struct uuid
> *uuid)
>  {
>      struct ofproto_dpif *ofproto;
>
> -    HMAP_FOR_EACH_WITH_HASH (ofproto,
> all_ofproto_dpifs_by_uuid_node,
> +    CMAP_FOR_EACH_WITH_HASH (ofproto,
> all_ofproto_dpifs_by_uuid_node,
>                               uuid_hash(uuid),
> &all_ofproto_dpifs_by_uuid) {
>          if (uuid_equals(&ofproto->uuid, uuid)) {
>              return ofproto;
> diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
> index 191cfcb0d..fba03f2cc 100644
> --- a/ofproto/ofproto-dpif.h
> +++ b/ofproto/ofproto-dpif.h
> @@ -295,7 +295,7 @@ struct ofproto_dpif {
>      struct hmap_node all_ofproto_dpifs_by_name_node;
>
>      /* In 'all_ofproto_dpifs_by_uuid'. */
> -    struct hmap_node all_ofproto_dpifs_by_uuid_node;
> +    struct cmap_node all_ofproto_dpifs_by_uuid_node;
>
>      struct ofproto up;
>      struct dpif_backer *backer;
> --
> 2.27.0
Adrian Moreno Feb. 14, 2022, 12:16 p.m. UTC | #4
On 1/18/22 14:28, wangyunjian via dev wrote:
> http://patchwork.ozlabs.org/project/openvswitch/patch/20200306130555.19884-1-guohongzhi1@huawei.com/
> This patch has been applied. But this patch do not fix the problem I stated.
> I think this problem is that hmap does not support concurrent access.
> 
> Thanks,
> Yunjian
> 
> From: 贺鹏 [mailto:xnhp0320@gmail.com]
> Sent: Tuesday, January 18, 2022 3:23 PM
> To: wangyunjian <wangyunjian@huawei.com>
> Cc: dev@openvswitch.org; i.maximets@ovn.org; dingxiaoxiong <dingxiaoxiong@huawei.com>
> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> 
> Hi, yunjian and Ilya,
> 
> this patch reminds me of the discussion we have in March last year, and during the discussion, you have spotted this
> thread-safety issue of uuid map. Unfortunately, in that email, you did not reply to the mailist, so I cannot give a link to
> the email. I attach it as a reference.
> 
> I quote it here:
> 
> "
> That is an interesting example here...
> I can't help but notice that this function is typically called
> from different handler or pmd threads and modified by the main thread
> while upcalls enabled.  And hmap is not a thread-safe structure.
> I guess, we have another possible problem here.  We need to protect
> at least this hmap and the other one with rw lock or something...
> Am I right or am I missing something?  What else we need to protect?
> "
> 
> And this patch fixes exactly the problem you stated.
> 
> 
> 
> wangyunjian via dev <ovs-dev@openvswitch.org<mailto:ovs-dev@openvswitch.org>> 于2022年1月8日周六 17:18写道:
> Friendly ping.
> 
>> -----Original Message-----
>> From: wangyunjian
>> Sent: Friday, December 3, 2021 7:25 PM
>> To: dev@openvswitch.org<mailto:dev@openvswitch.org>; i.maximets@ovn.org<mailto:i.maximets@ovn.org>
>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com<mailto:dingxiaoxiong@huawei.com>>; xudingke
>> <xudingke@huawei.com<mailto:xudingke@huawei.com>>; wangyunjian <wangyunjian@huawei.com<mailto:wangyunjian@huawei.com>>; Justin
>> Pettit <jpettit@ovn.org<mailto:jpettit@ovn.org>>
>> Subject: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>
>> When handler threads lookup a "ofproto" and use it, main thread maybe
>> remove and free the "ofproto" at the same time. The "ofproto" has not
>> been protected well, which can lead to an OVS crash.
>>
>> This patch fixes this by making the "ofproto" lookup RCU-safe by using
>> cmap instead of hmap and moving remove "ofproto" call before
>> xlate_txn_commit().
>>
>> (gdb) bt
>>    hmap_next (hmap.h:398)
>>    seq_wake_waiters (seq.c:326)
>>    seq_change_protected (seq.c:134)
>>    seq_change (seq.c:144)
>>    ofproto_dpif_send_async_msg (ofproto_dpif.c:263)
>>    process_upcall (ofproto_dpif_upcall.c:1782)
>>    recv_upcalls (ofproto_dpif_upcall.c:1026)
>>    udpif_upcall_handler (ofproto/ofproto_dpif_upcall.c:945)
>>    ovsthread_wrapper (ovs_thread.c:734)
>>
>> Cc: Justin Pettit <jpettit@ovn.org<mailto:jpettit@ovn.org>>
>> Fixes: fcb9579be3c7 ("ofproto: Add 'ofproto_uuid' and 'ofp_in_port' to user
>> action cookie.")
>> Signed-off-by: Yunjian Wang <wangyunjian@huawei.com<mailto:wangyunjian@huawei.com>>
>> ---
>>   ofproto/ofproto-dpif.c | 12 ++++++------
>>   ofproto/ofproto-dpif.h |  2 +-
>>   2 files changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
>> index cba49a99e..aa8d32f81 100644
>> --- a/ofproto/ofproto-dpif.c
>> +++ b/ofproto/ofproto-dpif.c
>> @@ -216,8 +216,7 @@ static struct hmap all_ofproto_dpifs_by_name =
>>
>> HMAP_INITIALIZER(&all_ofproto_dpifs_by_name);
>>
>>   /* All existing ofproto_dpif instances, indexed by ->uuid. */
>> -static struct hmap all_ofproto_dpifs_by_uuid =
>> -
>> HMAP_INITIALIZER(&all_ofproto_dpifs_by_uuid);
>> +static struct cmap all_ofproto_dpifs_by_uuid = CMAP_INITIALIZER;
>>
>>   static bool ofproto_use_tnl_push_pop = true;
>>   static void ofproto_unixctl_init(void);
>> @@ -1682,7 +1681,7 @@ construct(struct ofproto *ofproto_)
>>       hmap_insert(&all_ofproto_dpifs_by_name,
>>                   &ofproto->all_ofproto_dpifs_by_name_node,
>>                   hash_string(ofproto->up.name<http://up.name>, 0));
>> -    hmap_insert(&all_ofproto_dpifs_by_uuid,
>> +    cmap_insert(&all_ofproto_dpifs_by_uuid,
>>                   &ofproto->all_ofproto_dpifs_by_uuid_node,
>>                   uuid_hash(&ofproto->uuid));
>>       memset(&ofproto->stats, 0, sizeof ofproto->stats);
>> @@ -1778,12 +1777,13 @@ destruct(struct ofproto *ofproto_, bool del)
>>       ofproto->backer->need_revalidate = REV_RECONFIGURE;
>>       xlate_txn_start();
>>       xlate_remove_ofproto(ofproto);
>> +    cmap_remove(&all_ofproto_dpifs_by_uuid,
>> +                &ofproto->all_ofproto_dpifs_by_uuid_node,
>> +                uuid_hash(&ofproto->uuid));
> 
> I guess some comments are needed here, you actually make use of
> the rcu_synchronize in the xlate_txn_commit to avoid access to the
> ofproto from other thread through uuid map.

Relying on the synchronization mechanism of another data structure (in this 
case, the struct xlate_cfg) is a bit strange. For any change that we do on 
xlate_cfg's synchronization now we have to consider that it can affect 
all_ofproto_dpifs_by_uuid_node.

Have you considered adding "all_ofproto_dpifs_by_uuid_node" to "struct xlate_cfg"?

> 
>>       xlate_txn_commit();
>>
>>       hmap_remove(&all_ofproto_dpifs_by_name,
>>                   &ofproto->all_ofproto_dpifs_by_name_node);
>> -    hmap_remove(&all_ofproto_dpifs_by_uuid,
>> -                &ofproto->all_ofproto_dpifs_by_uuid_node);
>>
>>       OFPROTO_FOR_EACH_TABLE (table, &ofproto->up) {
>>           CLS_FOR_EACH (rule, up.cr<http://up.cr>, &table->cls) {
>> @@ -5781,7 +5781,7 @@ ofproto_dpif_lookup_by_uuid(const struct uuid
>> *uuid)
>>   {
>>       struct ofproto_dpif *ofproto;
>>
>> -    HMAP_FOR_EACH_WITH_HASH (ofproto,
>> all_ofproto_dpifs_by_uuid_node,
>> +    CMAP_FOR_EACH_WITH_HASH (ofproto,
>> all_ofproto_dpifs_by_uuid_node,
>>                                uuid_hash(uuid),
>> &all_ofproto_dpifs_by_uuid) {
>>           if (uuid_equals(&ofproto->uuid, uuid)) {
>>               return ofproto;
>> diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
>> index 191cfcb0d..fba03f2cc 100644
>> --- a/ofproto/ofproto-dpif.h
>> +++ b/ofproto/ofproto-dpif.h
>> @@ -295,7 +295,7 @@ struct ofproto_dpif {
>>       struct hmap_node all_ofproto_dpifs_by_name_node;
>>
>>       /* In 'all_ofproto_dpifs_by_uuid'. */
>> -    struct hmap_node all_ofproto_dpifs_by_uuid_node;
>> +    struct cmap_node all_ofproto_dpifs_by_uuid_node;
>>
>>       struct ofproto up;
>>       struct dpif_backer *backer;
>> --
>> 2.27.0
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org<mailto:dev@openvswitch.org>
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 
> 
> --
> hepeng
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Gaetan Rivet Feb. 16, 2022, 11:34 a.m. UTC | #5
On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
> When handler threads lookup a "ofproto" and use it, main thread maybe
> remove and free the "ofproto" at the same time. The "ofproto" has not
> been protected well, which can lead to an OVS crash.
>
> This patch fixes this by making the "ofproto" lookup RCU-safe by using
> cmap instead of hmap and moving remove "ofproto" call before
> xlate_txn_commit().
>

I don't understand the point of moving the cmap_remove() call before
xlate_txn_commit().

It is 'nice-to-have', maybe. In essence, it is about making sure the ofproto
reference cannot be held by another thread while proceeding with the destruction.
It simplifies the execution pattern for such destruction.

But it seems other systems relying on ofproto access were written with the
possibility that it is currently in the process of being destroyed. Its freeing
is deferred, rules and groups are meant to proceed within this grace period.
Granted, there is currently a bug in the rule management, but this is a bug
and it is being fixed.

So while it is correct, and while it simplifies the mental model when looking
at the lifetime of ofproto, it does not seem necessary? Am I mistaken?

If it is necessary, it would be better to be explicit about it. If multiple levels
of the object rely on RCU sync, a single overarching call with a proper comment
would be safer to maintain. As it concerns destruct() safety, I think it is
worth having it explicitly at that level. It makes the fix more complex and more dangerous
with changes in xlate implementation however.
wangyunjian Feb. 16, 2022, 1:24 p.m. UTC | #6
> -----Original Message-----
> From: Gaëtan Rivet [mailto:grive@u256.net]
> Sent: Wednesday, February 16, 2022 7:34 PM
> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> 
> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
> > When handler threads lookup a "ofproto" and use it, main thread maybe
> > remove and free the "ofproto" at the same time. The "ofproto" has not
> > been protected well, which can lead to an OVS crash.
> >
> > This patch fixes this by making the "ofproto" lookup RCU-safe by using
> > cmap instead of hmap and moving remove "ofproto" call before
> > xlate_txn_commit().
> >
> 
> I don't understand the point of moving the cmap_remove() call before
> xlate_txn_commit().

To use of the rcu_synchronize in the xlate_txn_commit to avoid access to the
ofproto from other thread through uuid map.

> 
> It is 'nice-to-have', maybe. In essence, it is about making sure the ofproto
> reference cannot be held by another thread while proceeding with the
> destruction.
> It simplifies the execution pattern for such destruction.
> 
> But it seems other systems relying on ofproto access were written with the
> possibility that it is currently in the process of being destroyed. Its freeing is
> deferred, rules and groups are meant to proceed within this grace period.
> Granted, there is currently a bug in the rule management, but this is a bug and it
> is being fixed.
> 
> So while it is correct, and while it simplifies the mental model when looking at the
> lifetime of ofproto, it does not seem necessary? Am I mistaken?
> 
> If it is necessary, it would be better to be explicit about it. If multiple levels of the
> object rely on RCU sync, a single overarching call with a proper comment would
> be safer to maintain. As it concerns destruct() safety, I think it is worth having it
> explicitly at that level. It makes the fix more complex and more dangerous with
> changes in xlate implementation however.

I will add "all_ofproto_dpifs_by_uuid_node" to "struct xlate_cfg" to avoid this issue
according to Adrian Moreno's suggestion.

Thanks,
Yunjian

> 
> --
> Gaetan Rivet
Gaetan Rivet Feb. 16, 2022, 5:28 p.m. UTC | #7
On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>> -----Original Message-----
>> From: Gaëtan Rivet [mailto:grive@u256.net]
>> Sent: Wednesday, February 16, 2022 7:34 PM
>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> 
>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>> > When handler threads lookup a "ofproto" and use it, main thread maybe
>> > remove and free the "ofproto" at the same time. The "ofproto" has not
>> > been protected well, which can lead to an OVS crash.
>> >
>> > This patch fixes this by making the "ofproto" lookup RCU-safe by using
>> > cmap instead of hmap and moving remove "ofproto" call before
>> > xlate_txn_commit().
>> >
>> 
>> I don't understand the point of moving the cmap_remove() call before
>> xlate_txn_commit().
>
> To use of the rcu_synchronize in the xlate_txn_commit to avoid access to the
> ofproto from other thread through uuid map.
>

Yes the reason is clear.

But my question is why is it needed? It seems that the ofproto lifecycle
was written with the assumption that it would still be used while being destroyed.

Can you explain why it needs to be changed?
wangyunjian Feb. 17, 2022, 3:27 a.m. UTC | #8
> -----Original Message-----
> From: Gaëtan Rivet [mailto:grive@u256.net]
> Sent: Thursday, February 17, 2022 1:29 AM
> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>; 贺鹏
> <xnhp0320@gmail.com>
> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> 
> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
> >> -----Original Message-----
> >> From: Gaëtan Rivet [mailto:grive@u256.net]
> >> Sent: Wednesday, February 16, 2022 7:34 PM
> >> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
> >> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> >> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> >> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> >>
> >> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
> >> > When handler threads lookup a "ofproto" and use it, main thread
> >> > maybe remove and free the "ofproto" at the same time. The "ofproto"
> >> > has not been protected well, which can lead to an OVS crash.
> >> >
> >> > This patch fixes this by making the "ofproto" lookup RCU-safe by
> >> > using cmap instead of hmap and moving remove "ofproto" call before
> >> > xlate_txn_commit().
> >> >
> >>
> >> I don't understand the point of moving the cmap_remove() call before
> >> xlate_txn_commit().
> >
> > To use of the rcu_synchronize in the xlate_txn_commit to avoid access
> > to the ofproto from other thread through uuid map.
> >
> 
> Yes the reason is clear.
> 
> But my question is why is it needed? It seems that the ofproto lifecycle was
> written with the assumption that it would still be used while being destroyed.
> 
> Can you explain why it needs to be changed?

I didn't describe the problem clearly before. The main problem is that hmap variable
is not thread safe. The all_ofproto_dpifs_by_uuid variable uses the hmap structure,
which maybe be accessed by main thread and handler threads.

Thanks,
Yunjian

> 
> --
> Gaetan Rivet
wangyunjian Feb. 17, 2022, 7:29 a.m. UTC | #9
> -----Original Message-----
> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
> wangyunjian via dev
> Sent: Thursday, February 17, 2022 11:27 AM
> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> 
> 
> 
> > -----Original Message-----
> > From: Gaëtan Rivet [mailto:grive@u256.net]
> > Sent: Thursday, February 17, 2022 1:29 AM
> > To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
> > <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> > Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>; 贺
> 鹏
> > <xnhp0320@gmail.com>
> > Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> >
> > On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
> > >> -----Original Message-----
> > >> From: Gaëtan Rivet [mailto:grive@u256.net]
> > >> Sent: Wednesday, February 16, 2022 7:34 PM
> > >> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
> > >> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> > >> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> > >> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> > >>
> > >> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
> > >> > When handler threads lookup a "ofproto" and use it, main thread
> > >> > maybe remove and free the "ofproto" at the same time. The "ofproto"
> > >> > has not been protected well, which can lead to an OVS crash.
> > >> >
> > >> > This patch fixes this by making the "ofproto" lookup RCU-safe by
> > >> > using cmap instead of hmap and moving remove "ofproto" call
> > >> > before xlate_txn_commit().
> > >> >
> > >>
> > >> I don't understand the point of moving the cmap_remove() call
> > >> before xlate_txn_commit().
> > >
> > > To use of the rcu_synchronize in the xlate_txn_commit to avoid
> > > access to the ofproto from other thread through uuid map.
> > >
> >
> > Yes the reason is clear.
> >
> > But my question is why is it needed? It seems that the ofproto
> > lifecycle was written with the assumption that it would still be used while being
> destroyed.
> >
> > Can you explain why it needs to be changed?
> 
> I didn't describe the problem clearly before. The main problem is that hmap
> variable is not thread safe. The all_ofproto_dpifs_by_uuid variable uses the
> hmap structure, which maybe be accessed by main thread and handler threads.

I change the description to "ofproto: fix concurrent when looking up an ofproto by UUID.".
And it maybe be clearly understood. How about this?

Thanks,
Yunjian

> 
> Thanks,
> Yunjian
> 
> >
> > --
> > Gaetan Rivet
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Gaetan Rivet Feb. 17, 2022, 1:30 p.m. UTC | #10
On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>> -----Original Message-----
>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>> wangyunjian via dev
>> Sent: Thursday, February 17, 2022 11:27 AM
>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> 
>> 
>> 
>> > -----Original Message-----
>> > From: Gaëtan Rivet [mailto:grive@u256.net]
>> > Sent: Thursday, February 17, 2022 1:29 AM
>> > To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>> > <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>> > Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>; 贺
>> 鹏
>> > <xnhp0320@gmail.com>
>> > Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> >
>> > On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>> > >> -----Original Message-----
>> > >> From: Gaëtan Rivet [mailto:grive@u256.net]
>> > >> Sent: Wednesday, February 16, 2022 7:34 PM
>> > >> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>> > >> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>> > >> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> > >> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> > >>
>> > >> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>> > >> > When handler threads lookup a "ofproto" and use it, main thread
>> > >> > maybe remove and free the "ofproto" at the same time. The "ofproto"
>> > >> > has not been protected well, which can lead to an OVS crash.
>> > >> >
>> > >> > This patch fixes this by making the "ofproto" lookup RCU-safe by
>> > >> > using cmap instead of hmap and moving remove "ofproto" call
>> > >> > before xlate_txn_commit().
>> > >> >
>> > >>
>> > >> I don't understand the point of moving the cmap_remove() call
>> > >> before xlate_txn_commit().
>> > >
>> > > To use of the rcu_synchronize in the xlate_txn_commit to avoid
>> > > access to the ofproto from other thread through uuid map.
>> > >
>> >
>> > Yes the reason is clear.
>> >
>> > But my question is why is it needed? It seems that the ofproto
>> > lifecycle was written with the assumption that it would still be used while being
>> destroyed.
>> >
>> > Can you explain why it needs to be changed?
>> 
>> I didn't describe the problem clearly before. The main problem is that hmap
>> variable is not thread safe. The all_ofproto_dpifs_by_uuid variable uses the
>> hmap structure, which maybe be accessed by main thread and handler threads.

I'm ok on the part switching to using CMAP to allow concurrent reads.
That I see the reason and it is fine.

The part that I don't understand is moving the cmap_remove() call before the RCU sync.

As far as I know, the CMAP type does not require that to safely operate.
The writer thread is allowed to call cmap_remove() while a reader is iterating on
the CMAP to find a node. The only precaution needed is that actual destruction of the
node (freeing) is deferred, which it is.

So I don't see the reason to move cmap_remove() before the RCU synchronization, instead
of keeping it as it is now. Could you please explain your reasoning?
wangyunjian Feb. 17, 2022, 2:08 p.m. UTC | #11
> -----Original Message-----
> From: Gaëtan Rivet [mailto:grive@u256.net]
> Sent: Thursday, February 17, 2022 9:31 PM
> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
> <xnhp0320@gmail.com>; amorenoz@redhat.com
> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> 
> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
> >> -----Original Message-----
> >> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
> >> wangyunjian via dev
> >> Sent: Thursday, February 17, 2022 11:27 AM
> >> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
> >> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> >> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> >> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> >>
> >>
> >>
> >> > -----Original Message-----
> >> > From: Gaëtan Rivet [mailto:grive@u256.net]
> >> > Sent: Thursday, February 17, 2022 1:29 AM
> >> > To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
> >> > <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> >> > Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>;
> >> > 贺
> >> 鹏
> >> > <xnhp0320@gmail.com>
> >> > Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> >> >
> >> > On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
> >> > >> -----Original Message-----
> >> > >> From: Gaëtan Rivet [mailto:grive@u256.net]
> >> > >> Sent: Wednesday, February 16, 2022 7:34 PM
> >> > >> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
> >> > >> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> >> > >> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> >> > >> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> >> > >>
> >> > >> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
> >> > >> > When handler threads lookup a "ofproto" and use it, main
> >> > >> > thread maybe remove and free the "ofproto" at the same time. The
> "ofproto"
> >> > >> > has not been protected well, which can lead to an OVS crash.
> >> > >> >
> >> > >> > This patch fixes this by making the "ofproto" lookup RCU-safe
> >> > >> > by using cmap instead of hmap and moving remove "ofproto" call
> >> > >> > before xlate_txn_commit().
> >> > >> >
> >> > >>
> >> > >> I don't understand the point of moving the cmap_remove() call
> >> > >> before xlate_txn_commit().
> >> > >
> >> > > To use of the rcu_synchronize in the xlate_txn_commit to avoid
> >> > > access to the ofproto from other thread through uuid map.
> >> > >
> >> >
> >> > Yes the reason is clear.
> >> >
> >> > But my question is why is it needed? It seems that the ofproto
> >> > lifecycle was written with the assumption that it would still be
> >> > used while being
> >> destroyed.
> >> >
> >> > Can you explain why it needs to be changed?
> >>
> >> I didn't describe the problem clearly before. The main problem is
> >> that hmap variable is not thread safe. The all_ofproto_dpifs_by_uuid
> >> variable uses the hmap structure, which maybe be accessed by main thread
> and handler threads.
> 
> I'm ok on the part switching to using CMAP to allow concurrent reads.
> That I see the reason and it is fine.
> 
> The part that I don't understand is moving the cmap_remove() call before the
> RCU sync.
> 
> As far as I know, the CMAP type does not require that to safely operate.
> The writer thread is allowed to call cmap_remove() while a reader is iterating on
> the CMAP to find a node. The only precaution needed is that actual destruction
> of the node (freeing) is deferred, which it is.
> 
> So I don't see the reason to move cmap_remove() before the RCU
> synchronization, instead of keeping it as it is now. Could you please explain your
> reasoning?

I consider it is more reasonable for the upcall thread cannot find the ofproto
when the ofproto will be deleted, no other special consideration.

Thanks,
Yunjian
> 
> --
> Gaetan Rivet
Gaetan Rivet Feb. 17, 2022, 3:15 p.m. UTC | #12
On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
>> -----Original Message-----
>> From: Gaëtan Rivet [mailto:grive@u256.net]
>> Sent: Thursday, February 17, 2022 9:31 PM
>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
>> <xnhp0320@gmail.com>; amorenoz@redhat.com
>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> 
>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>> >> -----Original Message-----
>> >> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>> >> wangyunjian via dev
>> >> Sent: Thursday, February 17, 2022 11:27 AM
>> >> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>> >> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>> >> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> >> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> >>
>> >>
>> >>
>> >> > -----Original Message-----
>> >> > From: Gaëtan Rivet [mailto:grive@u256.net]
>> >> > Sent: Thursday, February 17, 2022 1:29 AM
>> >> > To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>> >> > <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>> >> > Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>;
>> >> > 贺
>> >> 鹏
>> >> > <xnhp0320@gmail.com>
>> >> > Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> >> >
>> >> > On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>> >> > >> -----Original Message-----
>> >> > >> From: Gaëtan Rivet [mailto:grive@u256.net]
>> >> > >> Sent: Wednesday, February 16, 2022 7:34 PM
>> >> > >> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>> >> > >> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>> >> > >> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> >> > >> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> >> > >>
>> >> > >> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>> >> > >> > When handler threads lookup a "ofproto" and use it, main
>> >> > >> > thread maybe remove and free the "ofproto" at the same time. The
>> "ofproto"
>> >> > >> > has not been protected well, which can lead to an OVS crash.
>> >> > >> >
>> >> > >> > This patch fixes this by making the "ofproto" lookup RCU-safe
>> >> > >> > by using cmap instead of hmap and moving remove "ofproto" call
>> >> > >> > before xlate_txn_commit().
>> >> > >> >
>> >> > >>
>> >> > >> I don't understand the point of moving the cmap_remove() call
>> >> > >> before xlate_txn_commit().
>> >> > >
>> >> > > To use of the rcu_synchronize in the xlate_txn_commit to avoid
>> >> > > access to the ofproto from other thread through uuid map.
>> >> > >
>> >> >
>> >> > Yes the reason is clear.
>> >> >
>> >> > But my question is why is it needed? It seems that the ofproto
>> >> > lifecycle was written with the assumption that it would still be
>> >> > used while being
>> >> destroyed.
>> >> >
>> >> > Can you explain why it needs to be changed?
>> >>
>> >> I didn't describe the problem clearly before. The main problem is
>> >> that hmap variable is not thread safe. The all_ofproto_dpifs_by_uuid
>> >> variable uses the hmap structure, which maybe be accessed by main thread
>> and handler threads.
>> 
>> I'm ok on the part switching to using CMAP to allow concurrent reads.
>> That I see the reason and it is fine.
>> 
>> The part that I don't understand is moving the cmap_remove() call before the
>> RCU sync.
>> 
>> As far as I know, the CMAP type does not require that to safely operate.
>> The writer thread is allowed to call cmap_remove() while a reader is iterating on
>> the CMAP to find a node. The only precaution needed is that actual destruction
>> of the node (freeing) is deferred, which it is.
>> 
>> So I don't see the reason to move cmap_remove() before the RCU
>> synchronization, instead of keeping it as it is now. Could you please explain your
>> reasoning?
>
> I consider it is more reasonable for the upcall thread cannot find the ofproto
> when the ofproto will be deleted, no other special consideration.
>
> Thanks,
> Yunjian

That might be an interesting change to clean up the 'execution context' of the ofproto destruction.

But I think it is out of scope for this fix. It means changing the xlate object,
introduce coupling between the ofproto map and the xlate layer. This goes beyond
fixing undefined behavior due to concurrent accesses on the map.
Adrian Moreno Feb. 17, 2022, 3:40 p.m. UTC | #13
On 2/17/22 16:15, Gaëtan Rivet wrote:
> On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
>>> -----Original Message-----
>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>> Sent: Thursday, February 17, 2022 9:31 PM
>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
>>> <xnhp0320@gmail.com>; amorenoz@redhat.com
>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>
>>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>>>>> -----Original Message-----
>>>>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>>>>> wangyunjian via dev
>>>>> Sent: Thursday, February 17, 2022 11:27 AM
>>>>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>
>>>>>
>>>>>
>>>>>> -----Original Message-----
>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>> Sent: Thursday, February 17, 2022 1:29 AM
>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>> Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>;
>>>>>> 贺
>>>>> 鹏
>>>>>> <xnhp0320@gmail.com>
>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>
>>>>>> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>>>>>>>> -----Original Message-----
>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>> Sent: Wednesday, February 16, 2022 7:34 PM
>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>
>>>>>>>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>>>>>>>>> When handler threads lookup a "ofproto" and use it, main
>>>>>>>>> thread maybe remove and free the "ofproto" at the same time. The
>>> "ofproto"
>>>>>>>>> has not been protected well, which can lead to an OVS crash.
>>>>>>>>>
>>>>>>>>> This patch fixes this by making the "ofproto" lookup RCU-safe
>>>>>>>>> by using cmap instead of hmap and moving remove "ofproto" call
>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>
>>>>>>>>
>>>>>>>> I don't understand the point of moving the cmap_remove() call
>>>>>>>> before xlate_txn_commit().
>>>>>>>
>>>>>>> To use of the rcu_synchronize in the xlate_txn_commit to avoid
>>>>>>> access to the ofproto from other thread through uuid map.
>>>>>>>
>>>>>>
>>>>>> Yes the reason is clear.
>>>>>>
>>>>>> But my question is why is it needed? It seems that the ofproto
>>>>>> lifecycle was written with the assumption that it would still be
>>>>>> used while being
>>>>> destroyed.
>>>>>>
>>>>>> Can you explain why it needs to be changed?
>>>>>
>>>>> I didn't describe the problem clearly before. The main problem is
>>>>> that hmap variable is not thread safe. The all_ofproto_dpifs_by_uuid
>>>>> variable uses the hmap structure, which maybe be accessed by main thread
>>> and handler threads.
>>>
>>> I'm ok on the part switching to using CMAP to allow concurrent reads.
>>> That I see the reason and it is fine.
>>>
>>> The part that I don't understand is moving the cmap_remove() call before the
>>> RCU sync.
>>>
>>> As far as I know, the CMAP type does not require that to safely operate.
>>> The writer thread is allowed to call cmap_remove() while a reader is iterating on
>>> the CMAP to find a node. The only precaution needed is that actual destruction
>>> of the node (freeing) is deferred, which it is.
>>>
>>> So I don't see the reason to move cmap_remove() before the RCU
>>> synchronization, instead of keeping it as it is now. Could you please explain your
>>> reasoning?
>>
>> I consider it is more reasonable for the upcall thread cannot find the ofproto
>> when the ofproto will be deleted, no other special consideration.
>>
>> Thanks,
>> Yunjian
> 
> That might be an interesting change to clean up the 'execution context' of the ofproto destruction.
> 
> But I think it is out of scope for this fix. It means changing the xlate object,
> introduce coupling between the ofproto map and the xlate layer. This goes beyond
> fixing undefined behavior due to concurrent accesses on the map.
> 

Hi Gaëtan,

I think the root cause of the crash this fix pretends to fix is not the 
concurrent access to the map (though that is also an issue, of course) but the 
upcall using an ofproto-dpif object after the main thread has run its 
destruct(). So I think we need to fix both.

With regards to the way of fixing it, what do you think about removing 
all_ofproto_dpifs_by_uuid altogether and using xbridge_lookup_by_uuid() intead?

Thanks
Gaetan Rivet Feb. 18, 2022, 11:18 a.m. UTC | #14
On Thu, Feb 17, 2022, at 16:40, Adrian Moreno wrote:
> On 2/17/22 16:15, Gaëtan Rivet wrote:
>> On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
>>>> -----Original Message-----
>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>> Sent: Thursday, February 17, 2022 9:31 PM
>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
>>>> <xnhp0320@gmail.com>; amorenoz@redhat.com
>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>
>>>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>>>>>> -----Original Message-----
>>>>>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>>>>>> wangyunjian via dev
>>>>>> Sent: Thursday, February 17, 2022 11:27 AM
>>>>>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>
>>>>>>
>>>>>>
>>>>>>> -----Original Message-----
>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>> Sent: Thursday, February 17, 2022 1:29 AM
>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>> Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>;
>>>>>>> 贺
>>>>>> 鹏
>>>>>>> <xnhp0320@gmail.com>
>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>
>>>>>>> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>>> Sent: Wednesday, February 16, 2022 7:34 PM
>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>>
>>>>>>>>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>>>>>>>>>> When handler threads lookup a "ofproto" and use it, main
>>>>>>>>>> thread maybe remove and free the "ofproto" at the same time. The
>>>> "ofproto"
>>>>>>>>>> has not been protected well, which can lead to an OVS crash.
>>>>>>>>>>
>>>>>>>>>> This patch fixes this by making the "ofproto" lookup RCU-safe
>>>>>>>>>> by using cmap instead of hmap and moving remove "ofproto" call
>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I don't understand the point of moving the cmap_remove() call
>>>>>>>>> before xlate_txn_commit().
>>>>>>>>
>>>>>>>> To use of the rcu_synchronize in the xlate_txn_commit to avoid
>>>>>>>> access to the ofproto from other thread through uuid map.
>>>>>>>>
>>>>>>>
>>>>>>> Yes the reason is clear.
>>>>>>>
>>>>>>> But my question is why is it needed? It seems that the ofproto
>>>>>>> lifecycle was written with the assumption that it would still be
>>>>>>> used while being
>>>>>> destroyed.
>>>>>>>
>>>>>>> Can you explain why it needs to be changed?
>>>>>>
>>>>>> I didn't describe the problem clearly before. The main problem is
>>>>>> that hmap variable is not thread safe. The all_ofproto_dpifs_by_uuid
>>>>>> variable uses the hmap structure, which maybe be accessed by main thread
>>>> and handler threads.
>>>>
>>>> I'm ok on the part switching to using CMAP to allow concurrent reads.
>>>> That I see the reason and it is fine.
>>>>
>>>> The part that I don't understand is moving the cmap_remove() call before the
>>>> RCU sync.
>>>>
>>>> As far as I know, the CMAP type does not require that to safely operate.
>>>> The writer thread is allowed to call cmap_remove() while a reader is iterating on
>>>> the CMAP to find a node. The only precaution needed is that actual destruction
>>>> of the node (freeing) is deferred, which it is.
>>>>
>>>> So I don't see the reason to move cmap_remove() before the RCU
>>>> synchronization, instead of keeping it as it is now. Could you please explain your
>>>> reasoning?
>>>
>>> I consider it is more reasonable for the upcall thread cannot find the ofproto
>>> when the ofproto will be deleted, no other special consideration.
>>>
>>> Thanks,
>>> Yunjian
>> 
>> That might be an interesting change to clean up the 'execution context' of the ofproto destruction.
>> 
>> But I think it is out of scope for this fix. It means changing the xlate object,
>> introduce coupling between the ofproto map and the xlate layer. This goes beyond
>> fixing undefined behavior due to concurrent accesses on the map.
>> 
>
> Hi Gaëtan,
>
> I think the root cause of the crash this fix pretends to fix is not the 
> concurrent access to the map (though that is also an issue, of course) but the 
> upcall using an ofproto-dpif object after the main thread has run its 
> destruct(). So I think we need to fix both.
>
> With regards to the way of fixing it, what do you think about removing 
> all_ofproto_dpifs_by_uuid altogether and using xbridge_lookup_by_uuid() intead?
>
> Thanks
> -- 
> Adrián Moreno

Hi Adrián,

Thanks for explaining, it's clear now.
Your suggestion is interesting, it seems correct.

However, it means changing xcfg->xbridges into a cmap. It requires
xlate_xbridge_remove() itself to be RCU compatible. Although there is the
RCU synchronization during xlate_txn_commit(), there is still a window between
xlate_remove_ofproto() and xlate_txn_commit() during which a thread will lookup
into this CMAP and get an xbridge reference.

Maybe it only means having to ovsrcu_postpone(free, xbridge) in xlate_xbridge_remove(),
but it seems dangerous to me to open an execution context to concurrency for an object
that was not initially written for it, in the context of a bug fix.

In any case, I understand the purpose of the whole patch now.
Maybe moving the cmap_remove() before xlate_txn_commit(), with a proper comment
fully justifying it, is better for a fix that is meant to be backported?

Best regards,
Adrian Moreno Feb. 21, 2022, 7:44 a.m. UTC | #15
On 2/18/22 12:18, Gaëtan Rivet wrote:
> On Thu, Feb 17, 2022, at 16:40, Adrian Moreno wrote:
>> On 2/17/22 16:15, Gaëtan Rivet wrote:
>>> On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
>>>>> -----Original Message-----
>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>> Sent: Thursday, February 17, 2022 9:31 PM
>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
>>>>> <xnhp0320@gmail.com>; amorenoz@redhat.com
>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>
>>>>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>>>>>>> -----Original Message-----
>>>>>>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>>>>>>> wangyunjian via dev
>>>>>>> Sent: Thursday, February 17, 2022 11:27 AM
>>>>>>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>> -----Original Message-----
>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>> Sent: Thursday, February 17, 2022 1:29 AM
>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>> Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>;
>>>>>>>> 贺
>>>>>>> 鹏
>>>>>>>> <xnhp0320@gmail.com>
>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>
>>>>>>>> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>>>> Sent: Wednesday, February 16, 2022 7:34 PM
>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>>>
>>>>>>>>>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>>>>>>>>>>> When handler threads lookup a "ofproto" and use it, main
>>>>>>>>>>> thread maybe remove and free the "ofproto" at the same time. The
>>>>> "ofproto"
>>>>>>>>>>> has not been protected well, which can lead to an OVS crash.
>>>>>>>>>>>
>>>>>>>>>>> This patch fixes this by making the "ofproto" lookup RCU-safe
>>>>>>>>>>> by using cmap instead of hmap and moving remove "ofproto" call
>>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I don't understand the point of moving the cmap_remove() call
>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>
>>>>>>>>> To use of the rcu_synchronize in the xlate_txn_commit to avoid
>>>>>>>>> access to the ofproto from other thread through uuid map.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Yes the reason is clear.
>>>>>>>>
>>>>>>>> But my question is why is it needed? It seems that the ofproto
>>>>>>>> lifecycle was written with the assumption that it would still be
>>>>>>>> used while being
>>>>>>> destroyed.
>>>>>>>>
>>>>>>>> Can you explain why it needs to be changed?
>>>>>>>
>>>>>>> I didn't describe the problem clearly before. The main problem is
>>>>>>> that hmap variable is not thread safe. The all_ofproto_dpifs_by_uuid
>>>>>>> variable uses the hmap structure, which maybe be accessed by main thread
>>>>> and handler threads.
>>>>>
>>>>> I'm ok on the part switching to using CMAP to allow concurrent reads.
>>>>> That I see the reason and it is fine.
>>>>>
>>>>> The part that I don't understand is moving the cmap_remove() call before the
>>>>> RCU sync.
>>>>>
>>>>> As far as I know, the CMAP type does not require that to safely operate.
>>>>> The writer thread is allowed to call cmap_remove() while a reader is iterating on
>>>>> the CMAP to find a node. The only precaution needed is that actual destruction
>>>>> of the node (freeing) is deferred, which it is.
>>>>>
>>>>> So I don't see the reason to move cmap_remove() before the RCU
>>>>> synchronization, instead of keeping it as it is now. Could you please explain your
>>>>> reasoning?
>>>>
>>>> I consider it is more reasonable for the upcall thread cannot find the ofproto
>>>> when the ofproto will be deleted, no other special consideration.
>>>>
>>>> Thanks,
>>>> Yunjian
>>>
>>> That might be an interesting change to clean up the 'execution context' of the ofproto destruction.
>>>
>>> But I think it is out of scope for this fix. It means changing the xlate object,
>>> introduce coupling between the ofproto map and the xlate layer. This goes beyond
>>> fixing undefined behavior due to concurrent accesses on the map.
>>>
>>
>> Hi Gaëtan,
>>
>> I think the root cause of the crash this fix pretends to fix is not the
>> concurrent access to the map (though that is also an issue, of course) but the
>> upcall using an ofproto-dpif object after the main thread has run its
>> destruct(). So I think we need to fix both.
>>
>> With regards to the way of fixing it, what do you think about removing
>> all_ofproto_dpifs_by_uuid altogether and using xbridge_lookup_by_uuid() intead?
>>
>> Thanks
>> -- 
>> Adrián Moreno
> 
> Hi Adrián,
> 
> Thanks for explaining, it's clear now.
> Your suggestion is interesting, it seems correct.
> 
> However, it means changing xcfg->xbridges into a cmap. It requires
> xlate_xbridge_remove() itself to be RCU compatible. Although there is the
> RCU synchronization during xlate_txn_commit(), there is still a window between
> xlate_remove_ofproto() and xlate_txn_commit() during which a thread will lookup
> into this CMAP and get an xbridge reference.
> 

I don't think we would need xcfg->xbridges to be a cmap. Concurrency of the xcfg 
object is handled by the xlate_txn_start() / xlate_txn_commit() mechanism.
The entire xcfg is copied to new_xcfg, modifications are performed in next_cfg 
and then xcfg is replaced. Copy and replacement of the entire structure are rcu 
protected.

So, I think the hmap itself is safe to access and since xlate_txn_commit() calls 
ovsrcu_synchronize(), upcall threads won't have references to anything in the 
previous configuration.

So, AFAICS, this suggestion is just removing code, not adding it.

However, there is a potentially important behavioral change: the xbridge hmap 
uses the ofproto pointer as hash while the current map uses the uuid. So the 
lookup will be slower. Hence my original thought of just adding 
all_ofproto_dpifs_by_uuid to xcfg.

> Maybe it only means having to ovsrcu_postpone(free, xbridge) in xlate_xbridge_remove(),
> but it seems dangerous to me to open an execution context to concurrency for an object
> that was not initially written for it, in the context of a bug fix.
> 
> In any case, I understand the purpose of the whole patch now.
> Maybe moving the cmap_remove() before xlate_txn_commit(), with a proper comment
> fully justifying it, is better for a fix that is meant to be backported?
> 

I agree that we should look for a fix with low intrusiveness for backporting. 
However, for the master branch I believe we should prioritize robustness. And 
having an object (all_ofproto_dpifs_by_uuid) silently use the synchronization 
mechanism of another unrelated object (xcfg) seems to me as more error prone. WDYT?

> Best regards,
Gaetan Rivet Feb. 21, 2022, 10:42 a.m. UTC | #16
On Mon, Feb 21, 2022, at 08:44, Adrian Moreno wrote:
> On 2/18/22 12:18, Gaëtan Rivet wrote:
>> On Thu, Feb 17, 2022, at 16:40, Adrian Moreno wrote:
>>> On 2/17/22 16:15, Gaëtan Rivet wrote:
>>>> On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
>>>>>> -----Original Message-----
>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>> Sent: Thursday, February 17, 2022 9:31 PM
>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
>>>>>> <xnhp0320@gmail.com>; amorenoz@redhat.com
>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>
>>>>>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>>>>>>>> -----Original Message-----
>>>>>>>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>>>>>>>> wangyunjian via dev
>>>>>>>> Sent: Thursday, February 17, 2022 11:27 AM
>>>>>>>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>>> Sent: Thursday, February 17, 2022 1:29 AM
>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>> Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>;
>>>>>>>>> 贺
>>>>>>>> 鹏
>>>>>>>>> <xnhp0320@gmail.com>
>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>>
>>>>>>>>> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>>>>> Sent: Wednesday, February 16, 2022 7:34 PM
>>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>>>>
>>>>>>>>>>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>>>>>>>>>>>> When handler threads lookup a "ofproto" and use it, main
>>>>>>>>>>>> thread maybe remove and free the "ofproto" at the same time. The
>>>>>> "ofproto"
>>>>>>>>>>>> has not been protected well, which can lead to an OVS crash.
>>>>>>>>>>>>
>>>>>>>>>>>> This patch fixes this by making the "ofproto" lookup RCU-safe
>>>>>>>>>>>> by using cmap instead of hmap and moving remove "ofproto" call
>>>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I don't understand the point of moving the cmap_remove() call
>>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>>
>>>>>>>>>> To use of the rcu_synchronize in the xlate_txn_commit to avoid
>>>>>>>>>> access to the ofproto from other thread through uuid map.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Yes the reason is clear.
>>>>>>>>>
>>>>>>>>> But my question is why is it needed? It seems that the ofproto
>>>>>>>>> lifecycle was written with the assumption that it would still be
>>>>>>>>> used while being
>>>>>>>> destroyed.
>>>>>>>>>
>>>>>>>>> Can you explain why it needs to be changed?
>>>>>>>>
>>>>>>>> I didn't describe the problem clearly before. The main problem is
>>>>>>>> that hmap variable is not thread safe. The all_ofproto_dpifs_by_uuid
>>>>>>>> variable uses the hmap structure, which maybe be accessed by main thread
>>>>>> and handler threads.
>>>>>>
>>>>>> I'm ok on the part switching to using CMAP to allow concurrent reads.
>>>>>> That I see the reason and it is fine.
>>>>>>
>>>>>> The part that I don't understand is moving the cmap_remove() call before the
>>>>>> RCU sync.
>>>>>>
>>>>>> As far as I know, the CMAP type does not require that to safely operate.
>>>>>> The writer thread is allowed to call cmap_remove() while a reader is iterating on
>>>>>> the CMAP to find a node. The only precaution needed is that actual destruction
>>>>>> of the node (freeing) is deferred, which it is.
>>>>>>
>>>>>> So I don't see the reason to move cmap_remove() before the RCU
>>>>>> synchronization, instead of keeping it as it is now. Could you please explain your
>>>>>> reasoning?
>>>>>
>>>>> I consider it is more reasonable for the upcall thread cannot find the ofproto
>>>>> when the ofproto will be deleted, no other special consideration.
>>>>>
>>>>> Thanks,
>>>>> Yunjian
>>>>
>>>> That might be an interesting change to clean up the 'execution context' of the ofproto destruction.
>>>>
>>>> But I think it is out of scope for this fix. It means changing the xlate object,
>>>> introduce coupling between the ofproto map and the xlate layer. This goes beyond
>>>> fixing undefined behavior due to concurrent accesses on the map.
>>>>
>>>
>>> Hi Gaëtan,
>>>
>>> I think the root cause of the crash this fix pretends to fix is not the
>>> concurrent access to the map (though that is also an issue, of course) but the
>>> upcall using an ofproto-dpif object after the main thread has run its
>>> destruct(). So I think we need to fix both.
>>>
>>> With regards to the way of fixing it, what do you think about removing
>>> all_ofproto_dpifs_by_uuid altogether and using xbridge_lookup_by_uuid() intead?
>>>
>>> Thanks
>>> -- 
>>> Adrián Moreno
>> 
>> Hi Adrián,
>> 
>> Thanks for explaining, it's clear now.
>> Your suggestion is interesting, it seems correct.
>> 
>> However, it means changing xcfg->xbridges into a cmap. It requires
>> xlate_xbridge_remove() itself to be RCU compatible. Although there is the
>> RCU synchronization during xlate_txn_commit(), there is still a window between
>> xlate_remove_ofproto() and xlate_txn_commit() during which a thread will lookup
>> into this CMAP and get an xbridge reference.
>> 
>
> I don't think we would need xcfg->xbridges to be a cmap. Concurrency of 
> the xcfg 
> object is handled by the xlate_txn_start() / xlate_txn_commit() 
> mechanism.
> The entire xcfg is copied to new_xcfg, modifications are performed in 
> next_cfg 
> and then xcfg is replaced. Copy and replacement of the entire structure 
> are rcu 
> protected.
>
> So, I think the hmap itself is safe to access and since 
> xlate_txn_commit() calls 
> ovsrcu_synchronize(), upcall threads won't have references to anything 
> in the 
> previous configuration.
>
> So, AFAICS, this suggestion is just removing code, not adding it.
>

I believe you are right, that's nice!

> However, there is a potentially important behavioral change: the xbridge hmap 
> uses the ofproto pointer as hash while the current map uses the uuid. So the 
> lookup will be slower. Hence my original thought of just adding 
> all_ofproto_dpifs_by_uuid to xcfg.

Why not use the ofproto uuid as key for the xbridges map otherwise?

>
>> Maybe it only means having to ovsrcu_postpone(free, xbridge) in xlate_xbridge_remove(),
>> but it seems dangerous to me to open an execution context to concurrency for an object
>> that was not initially written for it, in the context of a bug fix.
>> 
>> In any case, I understand the purpose of the whole patch now.
>> Maybe moving the cmap_remove() before xlate_txn_commit(), with a proper comment
>> fully justifying it, is better for a fix that is meant to be backported?
>> 
>
> I agree that we should look for a fix with low intrusiveness for 
> backporting. 
> However, for the master branch I believe we should prioritize 
> robustness. And 
> having an object (all_ofproto_dpifs_by_uuid) silently use the 
> synchronization 
> mechanism of another unrelated object (xcfg) seems to me as more error 
> prone. WDYT?

Sure, totally agree on both counts. I was just worried that the concurrency
requirement for xlate would be too heavy for a fix, but I had missed what you just explained.
Adrian Moreno Feb. 21, 2022, 10:49 a.m. UTC | #17
On 2/21/22 11:42, Gaëtan Rivet wrote:
> On Mon, Feb 21, 2022, at 08:44, Adrian Moreno wrote:
>> On 2/18/22 12:18, Gaëtan Rivet wrote:
>>> On Thu, Feb 17, 2022, at 16:40, Adrian Moreno wrote:
>>>> On 2/17/22 16:15, Gaëtan Rivet wrote:
>>>>> On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
>>>>>>> -----Original Message-----
>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>> Sent: Thursday, February 17, 2022 9:31 PM
>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
>>>>>>> <xnhp0320@gmail.com>; amorenoz@redhat.com
>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>
>>>>>>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>>>>>>>>> wangyunjian via dev
>>>>>>>>> Sent: Thursday, February 17, 2022 11:27 AM
>>>>>>>>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>>>> Sent: Thursday, February 17, 2022 1:29 AM
>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>>> Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>;
>>>>>>>>>> 贺
>>>>>>>>> 鹏
>>>>>>>>>> <xnhp0320@gmail.com>
>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>>>
>>>>>>>>>> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>>>>>> Sent: Wednesday, February 16, 2022 7:34 PM
>>>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>>>>>
>>>>>>>>>>>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>>>>>>>>>>>>> When handler threads lookup a "ofproto" and use it, main
>>>>>>>>>>>>> thread maybe remove and free the "ofproto" at the same time. The
>>>>>>> "ofproto"
>>>>>>>>>>>>> has not been protected well, which can lead to an OVS crash.
>>>>>>>>>>>>>
>>>>>>>>>>>>> This patch fixes this by making the "ofproto" lookup RCU-safe
>>>>>>>>>>>>> by using cmap instead of hmap and moving remove "ofproto" call
>>>>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> I don't understand the point of moving the cmap_remove() call
>>>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>>>
>>>>>>>>>>> To use of the rcu_synchronize in the xlate_txn_commit to avoid
>>>>>>>>>>> access to the ofproto from other thread through uuid map.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Yes the reason is clear.
>>>>>>>>>>
>>>>>>>>>> But my question is why is it needed? It seems that the ofproto
>>>>>>>>>> lifecycle was written with the assumption that it would still be
>>>>>>>>>> used while being
>>>>>>>>> destroyed.
>>>>>>>>>>
>>>>>>>>>> Can you explain why it needs to be changed?
>>>>>>>>>
>>>>>>>>> I didn't describe the problem clearly before. The main problem is
>>>>>>>>> that hmap variable is not thread safe. The all_ofproto_dpifs_by_uuid
>>>>>>>>> variable uses the hmap structure, which maybe be accessed by main thread
>>>>>>> and handler threads.
>>>>>>>
>>>>>>> I'm ok on the part switching to using CMAP to allow concurrent reads.
>>>>>>> That I see the reason and it is fine.
>>>>>>>
>>>>>>> The part that I don't understand is moving the cmap_remove() call before the
>>>>>>> RCU sync.
>>>>>>>
>>>>>>> As far as I know, the CMAP type does not require that to safely operate.
>>>>>>> The writer thread is allowed to call cmap_remove() while a reader is iterating on
>>>>>>> the CMAP to find a node. The only precaution needed is that actual destruction
>>>>>>> of the node (freeing) is deferred, which it is.
>>>>>>>
>>>>>>> So I don't see the reason to move cmap_remove() before the RCU
>>>>>>> synchronization, instead of keeping it as it is now. Could you please explain your
>>>>>>> reasoning?
>>>>>>
>>>>>> I consider it is more reasonable for the upcall thread cannot find the ofproto
>>>>>> when the ofproto will be deleted, no other special consideration.
>>>>>>
>>>>>> Thanks,
>>>>>> Yunjian
>>>>>
>>>>> That might be an interesting change to clean up the 'execution context' of the ofproto destruction.
>>>>>
>>>>> But I think it is out of scope for this fix. It means changing the xlate object,
>>>>> introduce coupling between the ofproto map and the xlate layer. This goes beyond
>>>>> fixing undefined behavior due to concurrent accesses on the map.
>>>>>
>>>>
>>>> Hi Gaëtan,
>>>>
>>>> I think the root cause of the crash this fix pretends to fix is not the
>>>> concurrent access to the map (though that is also an issue, of course) but the
>>>> upcall using an ofproto-dpif object after the main thread has run its
>>>> destruct(). So I think we need to fix both.
>>>>
>>>> With regards to the way of fixing it, what do you think about removing
>>>> all_ofproto_dpifs_by_uuid altogether and using xbridge_lookup_by_uuid() intead?
>>>>
>>>> Thanks
>>>> -- 
>>>> Adrián Moreno
>>>
>>> Hi Adrián,
>>>
>>> Thanks for explaining, it's clear now.
>>> Your suggestion is interesting, it seems correct.
>>>
>>> However, it means changing xcfg->xbridges into a cmap. It requires
>>> xlate_xbridge_remove() itself to be RCU compatible. Although there is the
>>> RCU synchronization during xlate_txn_commit(), there is still a window between
>>> xlate_remove_ofproto() and xlate_txn_commit() during which a thread will lookup
>>> into this CMAP and get an xbridge reference.
>>>
>>
>> I don't think we would need xcfg->xbridges to be a cmap. Concurrency of
>> the xcfg
>> object is handled by the xlate_txn_start() / xlate_txn_commit()
>> mechanism.
>> The entire xcfg is copied to new_xcfg, modifications are performed in
>> next_cfg
>> and then xcfg is replaced. Copy and replacement of the entire structure
>> are rcu
>> protected.
>>
>> So, I think the hmap itself is safe to access and since
>> xlate_txn_commit() calls
>> ovsrcu_synchronize(), upcall threads won't have references to anything
>> in the
>> previous configuration.
>>
>> So, AFAICS, this suggestion is just removing code, not adding it.
>>
> 
> I believe you are right, that's nice!
> 
>> However, there is a potentially important behavioral change: the xbridge hmap
>> uses the ofproto pointer as hash while the current map uses the uuid. So the
>> lookup will be slower. Hence my original thought of just adding
>> all_ofproto_dpifs_by_uuid to xcfg.
> 
> Why not use the ofproto uuid as key for the xbridges map otherwise?
> 

Yes, that should work without really making xbridge_lookup() by ofproto-dpif 
pointer slower.

>>
>>> Maybe it only means having to ovsrcu_postpone(free, xbridge) in xlate_xbridge_remove(),
>>> but it seems dangerous to me to open an execution context to concurrency for an object
>>> that was not initially written for it, in the context of a bug fix.
>>>
>>> In any case, I understand the purpose of the whole patch now.
>>> Maybe moving the cmap_remove() before xlate_txn_commit(), with a proper comment
>>> fully justifying it, is better for a fix that is meant to be backported?
>>>
>>
>> I agree that we should look for a fix with low intrusiveness for
>> backporting.
>> However, for the master branch I believe we should prioritize
>> robustness. And
>> having an object (all_ofproto_dpifs_by_uuid) silently use the
>> synchronization
>> mechanism of another unrelated object (xcfg) seems to me as more error
>> prone. WDYT?
> 
> Sure, totally agree on both counts. I was just worried that the concurrency
> requirement for xlate would be too heavy for a fix, but I had missed what you just explained.
>
Adrian Moreno Feb. 23, 2022, 12:05 p.m. UTC | #18
On 2/21/22 11:49, Adrian Moreno wrote:
> 
> 
> On 2/21/22 11:42, Gaëtan Rivet wrote:
>> On Mon, Feb 21, 2022, at 08:44, Adrian Moreno wrote:
>>> On 2/18/22 12:18, Gaëtan Rivet wrote:
>>>> On Thu, Feb 17, 2022, at 16:40, Adrian Moreno wrote:
>>>>> On 2/17/22 16:15, Gaëtan Rivet wrote:
>>>>>> On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
>>>>>>>> -----Original Message-----
>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>> Sent: Thursday, February 17, 2022 9:31 PM
>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
>>>>>>>> <xnhp0320@gmail.com>; amorenoz@redhat.com
>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>
>>>>>>>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>>>>>>>>>> wangyunjian via dev
>>>>>>>>>> Sent: Thursday, February 17, 2022 11:27 AM
>>>>>>>>>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>>>>> Sent: Thursday, February 17, 2022 1:29 AM
>>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>>>> Cc: amorenoz@redhat.com; dingxiaoxiong <dingxiaoxiong@huawei.com>;
>>>>>>>>>>> 贺
>>>>>>>>>> 鹏
>>>>>>>>>>> <xnhp0320@gmail.com>
>>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for 
>>>>>>>>>>> "ofproto".
>>>>>>>>>>>
>>>>>>>>>>> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>>>>>>>>>>>>> -----Original Message-----
>>>>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>>>>>>>>>>>>> Sent: Wednesday, February 16, 2022 7:34 PM
>>>>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>; <dev@openvswitch.org>
>>>>>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>>>>>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>>>>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for 
>>>>>>>>>>>>> "ofproto".
>>>>>>>>>>>>>
>>>>>>>>>>>>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>>>>>>>>>>>>>> When handler threads lookup a "ofproto" and use it, main
>>>>>>>>>>>>>> thread maybe remove and free the "ofproto" at the same time. The
>>>>>>>> "ofproto"
>>>>>>>>>>>>>> has not been protected well, which can lead to an OVS crash.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> This patch fixes this by making the "ofproto" lookup RCU-safe
>>>>>>>>>>>>>> by using cmap instead of hmap and moving remove "ofproto" call
>>>>>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>>>>>>
>>>>>>>>>>>>>
>>>>>>>>>>>>> I don't understand the point of moving the cmap_remove() call
>>>>>>>>>>>>> before xlate_txn_commit().
>>>>>>>>>>>>
>>>>>>>>>>>> To use of the rcu_synchronize in the xlate_txn_commit to avoid
>>>>>>>>>>>> access to the ofproto from other thread through uuid map.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Yes the reason is clear.
>>>>>>>>>>>
>>>>>>>>>>> But my question is why is it needed? It seems that the ofproto
>>>>>>>>>>> lifecycle was written with the assumption that it would still be
>>>>>>>>>>> used while being
>>>>>>>>>> destroyed.
>>>>>>>>>>>
>>>>>>>>>>> Can you explain why it needs to be changed?
>>>>>>>>>>
>>>>>>>>>> I didn't describe the problem clearly before. The main problem is
>>>>>>>>>> that hmap variable is not thread safe. The all_ofproto_dpifs_by_uuid
>>>>>>>>>> variable uses the hmap structure, which maybe be accessed by main thread
>>>>>>>> and handler threads.
>>>>>>>>
>>>>>>>> I'm ok on the part switching to using CMAP to allow concurrent reads.
>>>>>>>> That I see the reason and it is fine.
>>>>>>>>
>>>>>>>> The part that I don't understand is moving the cmap_remove() call before 
>>>>>>>> the
>>>>>>>> RCU sync.
>>>>>>>>
>>>>>>>> As far as I know, the CMAP type does not require that to safely operate.
>>>>>>>> The writer thread is allowed to call cmap_remove() while a reader is 
>>>>>>>> iterating on
>>>>>>>> the CMAP to find a node. The only precaution needed is that actual 
>>>>>>>> destruction
>>>>>>>> of the node (freeing) is deferred, which it is.
>>>>>>>>
>>>>>>>> So I don't see the reason to move cmap_remove() before the RCU
>>>>>>>> synchronization, instead of keeping it as it is now. Could you please 
>>>>>>>> explain your
>>>>>>>> reasoning?
>>>>>>>
>>>>>>> I consider it is more reasonable for the upcall thread cannot find the 
>>>>>>> ofproto
>>>>>>> when the ofproto will be deleted, no other special consideration.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Yunjian
>>>>>>
>>>>>> That might be an interesting change to clean up the 'execution context' of 
>>>>>> the ofproto destruction.
>>>>>>
>>>>>> But I think it is out of scope for this fix. It means changing the xlate 
>>>>>> object,
>>>>>> introduce coupling between the ofproto map and the xlate layer. This goes 
>>>>>> beyond
>>>>>> fixing undefined behavior due to concurrent accesses on the map.
>>>>>>
>>>>>
>>>>> Hi Gaëtan,
>>>>>
>>>>> I think the root cause of the crash this fix pretends to fix is not the
>>>>> concurrent access to the map (though that is also an issue, of course) but the
>>>>> upcall using an ofproto-dpif object after the main thread has run its
>>>>> destruct(). So I think we need to fix both.
>>>>>
>>>>> With regards to the way of fixing it, what do you think about removing
>>>>> all_ofproto_dpifs_by_uuid altogether and using xbridge_lookup_by_uuid() 
>>>>> intead?
>>>>>
>>>>> Thanks
>>>>> -- 
>>>>> Adrián Moreno
>>>>
>>>> Hi Adrián,
>>>>
>>>> Thanks for explaining, it's clear now.
>>>> Your suggestion is interesting, it seems correct.
>>>>
>>>> However, it means changing xcfg->xbridges into a cmap. It requires
>>>> xlate_xbridge_remove() itself to be RCU compatible. Although there is the
>>>> RCU synchronization during xlate_txn_commit(), there is still a window between
>>>> xlate_remove_ofproto() and xlate_txn_commit() during which a thread will lookup
>>>> into this CMAP and get an xbridge reference.
>>>>
>>>
>>> I don't think we would need xcfg->xbridges to be a cmap. Concurrency of
>>> the xcfg
>>> object is handled by the xlate_txn_start() / xlate_txn_commit()
>>> mechanism.
>>> The entire xcfg is copied to new_xcfg, modifications are performed in
>>> next_cfg
>>> and then xcfg is replaced. Copy and replacement of the entire structure
>>> are rcu
>>> protected.
>>>
>>> So, I think the hmap itself is safe to access and since
>>> xlate_txn_commit() calls
>>> ovsrcu_synchronize(), upcall threads won't have references to anything
>>> in the
>>> previous configuration.
>>>
>>> So, AFAICS, this suggestion is just removing code, not adding it.
>>>
>>
>> I believe you are right, that's nice!
>>
>>> However, there is a potentially important behavioral change: the xbridge hmap
>>> uses the ofproto pointer as hash while the current map uses the uuid. So the
>>> lookup will be slower. Hence my original thought of just adding
>>> all_ofproto_dpifs_by_uuid to xcfg.
>>
>> Why not use the ofproto uuid as key for the xbridges map otherwise?
>>
> 
> Yes, that should work without really making xbridge_lookup() by ofproto-dpif 
> pointer slower.
> 
>>>
>>>> Maybe it only means having to ovsrcu_postpone(free, xbridge) in 
>>>> xlate_xbridge_remove(),
>>>> but it seems dangerous to me to open an execution context to concurrency for 
>>>> an object
>>>> that was not initially written for it, in the context of a bug fix.
>>>>
>>>> In any case, I understand the purpose of the whole patch now.
>>>> Maybe moving the cmap_remove() before xlate_txn_commit(), with a proper comment
>>>> fully justifying it, is better for a fix that is meant to be backported?
>>>>
>>>
>>> I agree that we should look for a fix with low intrusiveness for
>>> backporting.
>>> However, for the master branch I believe we should prioritize
>>> robustness. And
>>> having an object (all_ofproto_dpifs_by_uuid) silently use the
>>> synchronization
>>> mechanism of another unrelated object (xcfg) seems to me as more error
>>> prone. WDYT?
>>
>> Sure, totally agree on both counts. I was just worried that the concurrency
>> requirement for xlate would be too heavy for a fix, but I had missed what you 
>> just explained.
>>
> 


Hi Yunjian,

What do you think? Are you planning in sending v2?

Thanks,
wangyunjian Feb. 23, 2022, 2:29 p.m. UTC | #19
> -----Original Message-----
> From: Adrian Moreno [mailto:amorenoz@redhat.com]
> Sent: Wednesday, February 23, 2022 8:06 PM
> To: Gaëtan Rivet <grive@u256.net>; wangyunjian <wangyunjian@huawei.com>;
> dev@openvswitch.org; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
> <xnhp0320@gmail.com>
> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
> 
> 
> 
> On 2/21/22 11:49, Adrian Moreno wrote:
> >
> >
> > On 2/21/22 11:42, Gaëtan Rivet wrote:
> >> On Mon, Feb 21, 2022, at 08:44, Adrian Moreno wrote:
> >>> On 2/18/22 12:18, Gaëtan Rivet wrote:
> >>>> On Thu, Feb 17, 2022, at 16:40, Adrian Moreno wrote:
> >>>>> On 2/17/22 16:15, Gaëtan Rivet wrote:
> >>>>>> On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
> >>>>>>>> -----Original Message-----
> >>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
> >>>>>>>> Sent: Thursday, February 17, 2022 9:31 PM
> >>>>>>>> To: wangyunjian <wangyunjian@huawei.com>;
> <dev@openvswitch.org>
> >>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺
> 鹏
> >>>>>>>> <xnhp0320@gmail.com>; amorenoz@redhat.com
> >>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> >>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for
> "ofproto".
> >>>>>>>>
> >>>>>>>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
> >>>>>>>>>> -----Original Message-----
> >>>>>>>>>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf
> >>>>>>>>>> Of wangyunjian via dev
> >>>>>>>>>> Sent: Thursday, February 17, 2022 11:27 AM
> >>>>>>>>>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
> >>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
> >>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> >>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for
> "ofproto".
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>>> -----Original Message-----
> >>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
> >>>>>>>>>>> Sent: Thursday, February 17, 2022 1:29 AM
> >>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>;
> >>>>>>>>>>> <dev@openvswitch.org> <dev@openvswitch.org>; Ilya Maximets
> >>>>>>>>>>> <i.maximets@ovn.org>
> >>>>>>>>>>> Cc: amorenoz@redhat.com; dingxiaoxiong
> >>>>>>>>>>> <dingxiaoxiong@huawei.com>;
> >>>>>>>>>>> 贺
> >>>>>>>>>> 鹏
> >>>>>>>>>>> <xnhp0320@gmail.com>
> >>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free
> >>>>>>>>>>> for "ofproto".
> >>>>>>>>>>>
> >>>>>>>>>>> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
> >>>>>>>>>>>>> -----Original Message-----
> >>>>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
> >>>>>>>>>>>>> Sent: Wednesday, February 16, 2022 7:34 PM
> >>>>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>;
> >>>>>>>>>>>>> <dev@openvswitch.org> <dev@openvswitch.org>; Ilya
> Maximets
> >>>>>>>>>>>>> <i.maximets@ovn.org>
> >>>>>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
> >>>>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free
> >>>>>>>>>>>>> for "ofproto".
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
> >>>>>>>>>>>>>> When handler threads lookup a "ofproto" and use it, main
> >>>>>>>>>>>>>> thread maybe remove and free the "ofproto" at the same
> >>>>>>>>>>>>>> time. The
> >>>>>>>> "ofproto"
> >>>>>>>>>>>>>> has not been protected well, which can lead to an OVS crash.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> This patch fixes this by making the "ofproto" lookup
> >>>>>>>>>>>>>> RCU-safe by using cmap instead of hmap and moving remove
> >>>>>>>>>>>>>> "ofproto" call before xlate_txn_commit().
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> I don't understand the point of moving the cmap_remove()
> >>>>>>>>>>>>> call before xlate_txn_commit().
> >>>>>>>>>>>>
> >>>>>>>>>>>> To use of the rcu_synchronize in the xlate_txn_commit to
> >>>>>>>>>>>> avoid access to the ofproto from other thread through uuid map.
> >>>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> Yes the reason is clear.
> >>>>>>>>>>>
> >>>>>>>>>>> But my question is why is it needed? It seems that the
> >>>>>>>>>>> ofproto lifecycle was written with the assumption that it
> >>>>>>>>>>> would still be used while being
> >>>>>>>>>> destroyed.
> >>>>>>>>>>>
> >>>>>>>>>>> Can you explain why it needs to be changed?
> >>>>>>>>>>
> >>>>>>>>>> I didn't describe the problem clearly before. The main
> >>>>>>>>>> problem is that hmap variable is not thread safe. The
> >>>>>>>>>> all_ofproto_dpifs_by_uuid variable uses the hmap structure,
> >>>>>>>>>> which maybe be accessed by main thread
> >>>>>>>> and handler threads.
> >>>>>>>>
> >>>>>>>> I'm ok on the part switching to using CMAP to allow concurrent reads.
> >>>>>>>> That I see the reason and it is fine.
> >>>>>>>>
> >>>>>>>> The part that I don't understand is moving the cmap_remove()
> >>>>>>>> call before the RCU sync.
> >>>>>>>>
> >>>>>>>> As far as I know, the CMAP type does not require that to safely
> operate.
> >>>>>>>> The writer thread is allowed to call cmap_remove() while a
> >>>>>>>> reader is iterating on the CMAP to find a node. The only
> >>>>>>>> precaution needed is that actual destruction of the node
> >>>>>>>> (freeing) is deferred, which it is.
> >>>>>>>>
> >>>>>>>> So I don't see the reason to move cmap_remove() before the RCU
> >>>>>>>> synchronization, instead of keeping it as it is now. Could you
> >>>>>>>> please explain your reasoning?
> >>>>>>>
> >>>>>>> I consider it is more reasonable for the upcall thread cannot
> >>>>>>> find the ofproto when the ofproto will be deleted, no other
> >>>>>>> special consideration.
> >>>>>>>
> >>>>>>> Thanks,
> >>>>>>> Yunjian
> >>>>>>
> >>>>>> That might be an interesting change to clean up the 'execution
> >>>>>> context' of the ofproto destruction.
> >>>>>>
> >>>>>> But I think it is out of scope for this fix. It means changing
> >>>>>> the xlate object, introduce coupling between the ofproto map and
> >>>>>> the xlate layer. This goes beyond fixing undefined behavior due
> >>>>>> to concurrent accesses on the map.
> >>>>>>
> >>>>>
> >>>>> Hi Gaëtan,
> >>>>>
> >>>>> I think the root cause of the crash this fix pretends to fix is not the
> >>>>> concurrent access to the map (though that is also an issue, of course) but
> the
> >>>>> upcall using an ofproto-dpif object after the main thread has run its
> >>>>> destruct(). So I think we need to fix both.
> >>>>>
> >>>>> With regards to the way of fixing it, what do you think about removing
> >>>>> all_ofproto_dpifs_by_uuid altogether and using
> xbridge_lookup_by_uuid()
> >>>>> intead?
> >>>>>
> >>>>> Thanks
> >>>>> --
> >>>>> Adrián Moreno
> >>>>
> >>>> Hi Adrián,
> >>>>
> >>>> Thanks for explaining, it's clear now.
> >>>> Your suggestion is interesting, it seems correct.
> >>>>
> >>>> However, it means changing xcfg->xbridges into a cmap. It requires
> >>>> xlate_xbridge_remove() itself to be RCU compatible. Although there is the
> >>>> RCU synchronization during xlate_txn_commit(), there is still a window
> between
> >>>> xlate_remove_ofproto() and xlate_txn_commit() during which a thread will
> lookup
> >>>> into this CMAP and get an xbridge reference.
> >>>>
> >>>
> >>> I don't think we would need xcfg->xbridges to be a cmap. Concurrency of
> >>> the xcfg
> >>> object is handled by the xlate_txn_start() / xlate_txn_commit()
> >>> mechanism.
> >>> The entire xcfg is copied to new_xcfg, modifications are performed in
> >>> next_cfg
> >>> and then xcfg is replaced. Copy and replacement of the entire structure
> >>> are rcu
> >>> protected.
> >>>
> >>> So, I think the hmap itself is safe to access and since
> >>> xlate_txn_commit() calls
> >>> ovsrcu_synchronize(), upcall threads won't have references to anything
> >>> in the
> >>> previous configuration.
> >>>
> >>> So, AFAICS, this suggestion is just removing code, not adding it.
> >>>
> >>
> >> I believe you are right, that's nice!
> >>
> >>> However, there is a potentially important behavioral change: the xbridge
> hmap
> >>> uses the ofproto pointer as hash while the current map uses the uuid. So the
> >>> lookup will be slower. Hence my original thought of just adding
> >>> all_ofproto_dpifs_by_uuid to xcfg.
> >>
> >> Why not use the ofproto uuid as key for the xbridges map otherwise?
> >>
> >
> > Yes, that should work without really making xbridge_lookup() by ofproto-dpif
> > pointer slower.
> >
> >>>
> >>>> Maybe it only means having to ovsrcu_postpone(free, xbridge) in
> >>>> xlate_xbridge_remove(),
> >>>> but it seems dangerous to me to open an execution context to concurrency
> for
> >>>> an object
> >>>> that was not initially written for it, in the context of a bug fix.
> >>>>
> >>>> In any case, I understand the purpose of the whole patch now.
> >>>> Maybe moving the cmap_remove() before xlate_txn_commit(), with a
> proper comment
> >>>> fully justifying it, is better for a fix that is meant to be backported?
> >>>>
> >>>
> >>> I agree that we should look for a fix with low intrusiveness for
> >>> backporting.
> >>> However, for the master branch I believe we should prioritize
> >>> robustness. And
> >>> having an object (all_ofproto_dpifs_by_uuid) silently use the
> >>> synchronization
> >>> mechanism of another unrelated object (xcfg) seems to me as more error
> >>> prone. WDYT?
> >>
> >> Sure, totally agree on both counts. I was just worried that the concurrency
> >> requirement for xlate would be too heavy for a fix, but I had missed what you
> >> just explained.
> >>
> >
> 
> 
> Hi Yunjian,
> 
> What do you think? Are you planning in sending v2?

I agree with you. But I don't have time to update this patch at present.
Can you fix it?

Thanks

> 
> Thanks,
> --
> Adrián Moreno
Gaetan Rivet Feb. 23, 2022, 5:03 p.m. UTC | #20
On Wed, Feb 23, 2022, at 15:29, wangyunjian wrote:
>> -----Original Message-----
>> From: Adrian Moreno [mailto:amorenoz@redhat.com]
>> Sent: Wednesday, February 23, 2022 8:06 PM
>> To: Gaëtan Rivet <grive@u256.net>; wangyunjian <wangyunjian@huawei.com>;
>> dev@openvswitch.org; Ilya Maximets <i.maximets@ovn.org>; 贺鹏
>> <xnhp0320@gmail.com>
>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for "ofproto".
>> 
>> 
>> 
>> On 2/21/22 11:49, Adrian Moreno wrote:
>> >
>> >
>> > On 2/21/22 11:42, Gaëtan Rivet wrote:
>> >> On Mon, Feb 21, 2022, at 08:44, Adrian Moreno wrote:
>> >>> On 2/18/22 12:18, Gaëtan Rivet wrote:
>> >>>> On Thu, Feb 17, 2022, at 16:40, Adrian Moreno wrote:
>> >>>>> On 2/17/22 16:15, Gaëtan Rivet wrote:
>> >>>>>> On Thu, Feb 17, 2022, at 15:08, wangyunjian wrote:
>> >>>>>>>> -----Original Message-----
>> >>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>> >>>>>>>> Sent: Thursday, February 17, 2022 9:31 PM
>> >>>>>>>> To: wangyunjian <wangyunjian@huawei.com>;
>> <dev@openvswitch.org>
>> >>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>; 贺
>> 鹏
>> >>>>>>>> <xnhp0320@gmail.com>; amorenoz@redhat.com
>> >>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> >>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for
>> "ofproto".
>> >>>>>>>>
>> >>>>>>>> On Thu, Feb 17, 2022, at 08:29, wangyunjian wrote:
>> >>>>>>>>>> -----Original Message-----
>> >>>>>>>>>> From: dev [mailto:ovs-dev-bounces@openvswitch.org] On Behalf
>> >>>>>>>>>> Of wangyunjian via dev
>> >>>>>>>>>> Sent: Thursday, February 17, 2022 11:27 AM
>> >>>>>>>>>> To: Gaëtan Rivet <grive@u256.net>; <dev@openvswitch.org>
>> >>>>>>>>>> <dev@openvswitch.org>; Ilya Maximets <i.maximets@ovn.org>
>> >>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> >>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free for
>> "ofproto".
>> >>>>>>>>>>
>> >>>>>>>>>>
>> >>>>>>>>>>
>> >>>>>>>>>>> -----Original Message-----
>> >>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>> >>>>>>>>>>> Sent: Thursday, February 17, 2022 1:29 AM
>> >>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>;
>> >>>>>>>>>>> <dev@openvswitch.org> <dev@openvswitch.org>; Ilya Maximets
>> >>>>>>>>>>> <i.maximets@ovn.org>
>> >>>>>>>>>>> Cc: amorenoz@redhat.com; dingxiaoxiong
>> >>>>>>>>>>> <dingxiaoxiong@huawei.com>;
>> >>>>>>>>>>> 贺
>> >>>>>>>>>> 鹏
>> >>>>>>>>>>> <xnhp0320@gmail.com>
>> >>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free
>> >>>>>>>>>>> for "ofproto".
>> >>>>>>>>>>>
>> >>>>>>>>>>> On Wed, Feb 16, 2022, at 14:24, wangyunjian wrote:
>> >>>>>>>>>>>>> -----Original Message-----
>> >>>>>>>>>>>>> From: Gaëtan Rivet [mailto:grive@u256.net]
>> >>>>>>>>>>>>> Sent: Wednesday, February 16, 2022 7:34 PM
>> >>>>>>>>>>>>> To: wangyunjian <wangyunjian@huawei.com>;
>> >>>>>>>>>>>>> <dev@openvswitch.org> <dev@openvswitch.org>; Ilya
>> Maximets
>> >>>>>>>>>>>>> <i.maximets@ovn.org>
>> >>>>>>>>>>>>> Cc: dingxiaoxiong <dingxiaoxiong@huawei.com>
>> >>>>>>>>>>>>> Subject: Re: [ovs-dev] [PATCH] ofproto: fix use-after-free
>> >>>>>>>>>>>>> for "ofproto".
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> On Fri, Dec 3, 2021, at 12:25, Yunjian Wang via dev wrote:
>> >>>>>>>>>>>>>> When handler threads lookup a "ofproto" and use it, main
>> >>>>>>>>>>>>>> thread maybe remove and free the "ofproto" at the same
>> >>>>>>>>>>>>>> time. The
>> >>>>>>>> "ofproto"
>> >>>>>>>>>>>>>> has not been protected well, which can lead to an OVS crash.
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>> This patch fixes this by making the "ofproto" lookup
>> >>>>>>>>>>>>>> RCU-safe by using cmap instead of hmap and moving remove
>> >>>>>>>>>>>>>> "ofproto" call before xlate_txn_commit().
>> >>>>>>>>>>>>>>
>> >>>>>>>>>>>>>
>> >>>>>>>>>>>>> I don't understand the point of moving the cmap_remove()
>> >>>>>>>>>>>>> call before xlate_txn_commit().
>> >>>>>>>>>>>>
>> >>>>>>>>>>>> To use of the rcu_synchronize in the xlate_txn_commit to
>> >>>>>>>>>>>> avoid access to the ofproto from other thread through uuid map.
>> >>>>>>>>>>>>
>> >>>>>>>>>>>
>> >>>>>>>>>>> Yes the reason is clear.
>> >>>>>>>>>>>
>> >>>>>>>>>>> But my question is why is it needed? It seems that the
>> >>>>>>>>>>> ofproto lifecycle was written with the assumption that it
>> >>>>>>>>>>> would still be used while being
>> >>>>>>>>>> destroyed.
>> >>>>>>>>>>>
>> >>>>>>>>>>> Can you explain why it needs to be changed?
>> >>>>>>>>>>
>> >>>>>>>>>> I didn't describe the problem clearly before. The main
>> >>>>>>>>>> problem is that hmap variable is not thread safe. The
>> >>>>>>>>>> all_ofproto_dpifs_by_uuid variable uses the hmap structure,
>> >>>>>>>>>> which maybe be accessed by main thread
>> >>>>>>>> and handler threads.
>> >>>>>>>>
>> >>>>>>>> I'm ok on the part switching to using CMAP to allow concurrent reads.
>> >>>>>>>> That I see the reason and it is fine.
>> >>>>>>>>
>> >>>>>>>> The part that I don't understand is moving the cmap_remove()
>> >>>>>>>> call before the RCU sync.
>> >>>>>>>>
>> >>>>>>>> As far as I know, the CMAP type does not require that to safely
>> operate.
>> >>>>>>>> The writer thread is allowed to call cmap_remove() while a
>> >>>>>>>> reader is iterating on the CMAP to find a node. The only
>> >>>>>>>> precaution needed is that actual destruction of the node
>> >>>>>>>> (freeing) is deferred, which it is.
>> >>>>>>>>
>> >>>>>>>> So I don't see the reason to move cmap_remove() before the RCU
>> >>>>>>>> synchronization, instead of keeping it as it is now. Could you
>> >>>>>>>> please explain your reasoning?
>> >>>>>>>
>> >>>>>>> I consider it is more reasonable for the upcall thread cannot
>> >>>>>>> find the ofproto when the ofproto will be deleted, no other
>> >>>>>>> special consideration.
>> >>>>>>>
>> >>>>>>> Thanks,
>> >>>>>>> Yunjian
>> >>>>>>
>> >>>>>> That might be an interesting change to clean up the 'execution
>> >>>>>> context' of the ofproto destruction.
>> >>>>>>
>> >>>>>> But I think it is out of scope for this fix. It means changing
>> >>>>>> the xlate object, introduce coupling between the ofproto map and
>> >>>>>> the xlate layer. This goes beyond fixing undefined behavior due
>> >>>>>> to concurrent accesses on the map.
>> >>>>>>
>> >>>>>
>> >>>>> Hi Gaëtan,
>> >>>>>
>> >>>>> I think the root cause of the crash this fix pretends to fix is not the
>> >>>>> concurrent access to the map (though that is also an issue, of course) but
>> the
>> >>>>> upcall using an ofproto-dpif object after the main thread has run its
>> >>>>> destruct(). So I think we need to fix both.
>> >>>>>
>> >>>>> With regards to the way of fixing it, what do you think about removing
>> >>>>> all_ofproto_dpifs_by_uuid altogether and using
>> xbridge_lookup_by_uuid()
>> >>>>> intead?
>> >>>>>
>> >>>>> Thanks
>> >>>>> --
>> >>>>> Adrián Moreno
>> >>>>
>> >>>> Hi Adrián,
>> >>>>
>> >>>> Thanks for explaining, it's clear now.
>> >>>> Your suggestion is interesting, it seems correct.
>> >>>>
>> >>>> However, it means changing xcfg->xbridges into a cmap. It requires
>> >>>> xlate_xbridge_remove() itself to be RCU compatible. Although there is the
>> >>>> RCU synchronization during xlate_txn_commit(), there is still a window
>> between
>> >>>> xlate_remove_ofproto() and xlate_txn_commit() during which a thread will
>> lookup
>> >>>> into this CMAP and get an xbridge reference.
>> >>>>
>> >>>
>> >>> I don't think we would need xcfg->xbridges to be a cmap. Concurrency of
>> >>> the xcfg
>> >>> object is handled by the xlate_txn_start() / xlate_txn_commit()
>> >>> mechanism.
>> >>> The entire xcfg is copied to new_xcfg, modifications are performed in
>> >>> next_cfg
>> >>> and then xcfg is replaced. Copy and replacement of the entire structure
>> >>> are rcu
>> >>> protected.
>> >>>
>> >>> So, I think the hmap itself is safe to access and since
>> >>> xlate_txn_commit() calls
>> >>> ovsrcu_synchronize(), upcall threads won't have references to anything
>> >>> in the
>> >>> previous configuration.
>> >>>
>> >>> So, AFAICS, this suggestion is just removing code, not adding it.
>> >>>
>> >>
>> >> I believe you are right, that's nice!
>> >>
>> >>> However, there is a potentially important behavioral change: the xbridge
>> hmap
>> >>> uses the ofproto pointer as hash while the current map uses the uuid. So the
>> >>> lookup will be slower. Hence my original thought of just adding
>> >>> all_ofproto_dpifs_by_uuid to xcfg.
>> >>
>> >> Why not use the ofproto uuid as key for the xbridges map otherwise?
>> >>
>> >
>> > Yes, that should work without really making xbridge_lookup() by ofproto-dpif
>> > pointer slower.
>> >
>> >>>
>> >>>> Maybe it only means having to ovsrcu_postpone(free, xbridge) in
>> >>>> xlate_xbridge_remove(),
>> >>>> but it seems dangerous to me to open an execution context to concurrency
>> for
>> >>>> an object
>> >>>> that was not initially written for it, in the context of a bug fix.
>> >>>>
>> >>>> In any case, I understand the purpose of the whole patch now.
>> >>>> Maybe moving the cmap_remove() before xlate_txn_commit(), with a
>> proper comment
>> >>>> fully justifying it, is better for a fix that is meant to be backported?
>> >>>>
>> >>>
>> >>> I agree that we should look for a fix with low intrusiveness for
>> >>> backporting.
>> >>> However, for the master branch I believe we should prioritize
>> >>> robustness. And
>> >>> having an object (all_ofproto_dpifs_by_uuid) silently use the
>> >>> synchronization
>> >>> mechanism of another unrelated object (xcfg) seems to me as more error
>> >>> prone. WDYT?
>> >>
>> >> Sure, totally agree on both counts. I was just worried that the concurrency
>> >> requirement for xlate would be too heavy for a fix, but I had missed what you
>> >> just explained.
>> >>
>> >
>> 
>> 
>> Hi Yunjian,
>> 
>> What do you think? Are you planning in sending v2?
>
> I agree with you. But I don't have time to update this patch at present.
> Can you fix it?
>
> Thanks
>
>> 
>> Thanks,
>> --
>> Adrián Moreno

Hi all,

I have a version written, I will submit it after running some tests.
diff mbox series

Patch

diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index cba49a99e..aa8d32f81 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -216,8 +216,7 @@  static struct hmap all_ofproto_dpifs_by_name =
                           HMAP_INITIALIZER(&all_ofproto_dpifs_by_name);
 
 /* All existing ofproto_dpif instances, indexed by ->uuid. */
-static struct hmap all_ofproto_dpifs_by_uuid =
-                          HMAP_INITIALIZER(&all_ofproto_dpifs_by_uuid);
+static struct cmap all_ofproto_dpifs_by_uuid = CMAP_INITIALIZER;
 
 static bool ofproto_use_tnl_push_pop = true;
 static void ofproto_unixctl_init(void);
@@ -1682,7 +1681,7 @@  construct(struct ofproto *ofproto_)
     hmap_insert(&all_ofproto_dpifs_by_name,
                 &ofproto->all_ofproto_dpifs_by_name_node,
                 hash_string(ofproto->up.name, 0));
-    hmap_insert(&all_ofproto_dpifs_by_uuid,
+    cmap_insert(&all_ofproto_dpifs_by_uuid,
                 &ofproto->all_ofproto_dpifs_by_uuid_node,
                 uuid_hash(&ofproto->uuid));
     memset(&ofproto->stats, 0, sizeof ofproto->stats);
@@ -1778,12 +1777,13 @@  destruct(struct ofproto *ofproto_, bool del)
     ofproto->backer->need_revalidate = REV_RECONFIGURE;
     xlate_txn_start();
     xlate_remove_ofproto(ofproto);
+    cmap_remove(&all_ofproto_dpifs_by_uuid,
+                &ofproto->all_ofproto_dpifs_by_uuid_node,
+                uuid_hash(&ofproto->uuid));
     xlate_txn_commit();
 
     hmap_remove(&all_ofproto_dpifs_by_name,
                 &ofproto->all_ofproto_dpifs_by_name_node);
-    hmap_remove(&all_ofproto_dpifs_by_uuid,
-                &ofproto->all_ofproto_dpifs_by_uuid_node);
 
     OFPROTO_FOR_EACH_TABLE (table, &ofproto->up) {
         CLS_FOR_EACH (rule, up.cr, &table->cls) {
@@ -5781,7 +5781,7 @@  ofproto_dpif_lookup_by_uuid(const struct uuid *uuid)
 {
     struct ofproto_dpif *ofproto;
 
-    HMAP_FOR_EACH_WITH_HASH (ofproto, all_ofproto_dpifs_by_uuid_node,
+    CMAP_FOR_EACH_WITH_HASH (ofproto, all_ofproto_dpifs_by_uuid_node,
                              uuid_hash(uuid), &all_ofproto_dpifs_by_uuid) {
         if (uuid_equals(&ofproto->uuid, uuid)) {
             return ofproto;
diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
index 191cfcb0d..fba03f2cc 100644
--- a/ofproto/ofproto-dpif.h
+++ b/ofproto/ofproto-dpif.h
@@ -295,7 +295,7 @@  struct ofproto_dpif {
     struct hmap_node all_ofproto_dpifs_by_name_node;
 
     /* In 'all_ofproto_dpifs_by_uuid'. */
-    struct hmap_node all_ofproto_dpifs_by_uuid_node;
+    struct cmap_node all_ofproto_dpifs_by_uuid_node;
 
     struct ofproto up;
     struct dpif_backer *backer;