diff mbox series

[ovs-dev,v3,RESEND] classifier: Prevent tries vs n_tries race leading to NULL dereference.

Message ID 20200517230623.81272-1-eiichi.tsukata@nutanix.com
State Changes Requested
Delegated to: Ilya Maximets
Headers show
Series [ovs-dev,v3,RESEND] classifier: Prevent tries vs n_tries race leading to NULL dereference. | expand

Commit Message

Eiichi Tsukata May 17, 2020, 11:06 p.m. UTC
Currently classifier tries and n_tries can be updated not atomically,
there is a race condition which can lead to NULL dereference.
The race can happen when main thread updates a classifier tries and
n_tries in classifier_set_prefix_fields() and at the same time revalidator
or handler thread try to lookup them in classifier_lookup__(). Such race
can be triggered when user changes prefixes of flow_table.

Race(user changes flow_table prefixes: ip_dst,ip_src => none):

  [main thread]             [revalidator/handler thread]
  ===========================================================
                            /* cls->n_tries == 2 */
                            for (int i = 0; i < cls->n_tries; i++) {
  trie_init(cls, i, NULL);
  /* n_tries == 0 */
  cls->n_tries = n_tries;
                            /* cls->tries[i]->feild is NULL */
                            trie_ctx_init(&trie_ctx[i],&cls->tries[i]);
                            /* trie->field is NULL */
                            ctx->be32ofs = trie->field->flow_be32ofs;

To prevent the race, instead of re-introducing internal mutex
implemented in the commit fccd7c092e09 ("classifier: Remove internal
mutex."), this patch makes trie field RCU protected and checks it after
read.

Fixes: fccd7c092e09 ("classifier: Remove internal mutex.")
Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
---
 lib/classifier.c        | 37 ++++++++++++++++++++++---------------
 lib/classifier.h        |  6 ++++--
 tests/test-classifier.c |  5 +++--
 3 files changed, 29 insertions(+), 19 deletions(-)

Comments

Ilya Maximets May 26, 2020, 4:28 p.m. UTC | #1
On 5/18/20 1:06 AM, Eiichi Tsukata wrote:
> Currently classifier tries and n_tries can be updated not atomically,
> there is a race condition which can lead to NULL dereference.
> The race can happen when main thread updates a classifier tries and
> n_tries in classifier_set_prefix_fields() and at the same time revalidator
> or handler thread try to lookup them in classifier_lookup__(). Such race
> can be triggered when user changes prefixes of flow_table.
> 
> Race(user changes flow_table prefixes: ip_dst,ip_src => none):
> 
>   [main thread]             [revalidator/handler thread]
>   ===========================================================
>                             /* cls->n_tries == 2 */
>                             for (int i = 0; i < cls->n_tries; i++) {
>   trie_init(cls, i, NULL);
>   /* n_tries == 0 */
>   cls->n_tries = n_tries;
>                             /* cls->tries[i]->feild is NULL */
>                             trie_ctx_init(&trie_ctx[i],&cls->tries[i]);
>                             /* trie->field is NULL */
>                             ctx->be32ofs = trie->field->flow_be32ofs;
> 
> To prevent the race, instead of re-introducing internal mutex
> implemented in the commit fccd7c092e09 ("classifier: Remove internal
> mutex."), this patch makes trie field RCU protected and checks it after
> read.
> 
> Fixes: fccd7c092e09 ("classifier: Remove internal mutex.")
> Signed-off-by: Eiichi Tsukata <eiichi.tsukata@nutanix.com>
> ---

Hi. Sorry for reiews taking so long.
Few commnets inline.

Best regards, Ilya Maximets.

>  lib/classifier.c        | 37 ++++++++++++++++++++++---------------
>  lib/classifier.h        |  6 ++++--
>  tests/test-classifier.c |  5 +++--
>  3 files changed, 29 insertions(+), 19 deletions(-)
> 
> diff --git a/lib/classifier.c b/lib/classifier.c
> index 0fad953..10909a6 100644
> --- a/lib/classifier.c
> +++ b/lib/classifier.c
> @@ -393,7 +393,9 @@ classifier_set_prefix_fields(struct classifier *cls,
>          bitmap_set1(fields.bm, trie_fields[i]);
>  
>          new_fields[n_tries] = NULL;
> -        if (n_tries >= cls->n_tries || field != cls->tries[n_tries].field) {
> +        const struct mf_field *cls_field
> +            = ovsrcu_get(struct mf_field *, &cls->tries[n_tries].field);
> +        if (n_tries >= cls->n_tries || field != cls_field) {
>              new_fields[n_tries] = field;
>              changed = true;
>          }
> @@ -454,7 +456,7 @@ trie_init(struct classifier *cls, int trie_idx, const struct mf_field *field)
>      } else {
>          ovsrcu_set_hidden(&trie->root, NULL);
>      }
> -    trie->field = field;
> +    ovsrcu_set_hidden(&trie->field, field);

CONST_CAST required for the 'field' argument.  Otherwise build fails
due to discarded const qualifier.

>  
>      /* Add existing rules to the new trie. */
>      CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
> @@ -839,7 +841,6 @@ classifier_remove_assert(struct classifier *cls,
>  struct trie_ctx {
>      const struct cls_trie *trie;
>      bool lookup_done;        /* Status of the lookup. */
> -    uint8_t be32ofs;         /* U32 offset of the field in question. */
>      unsigned int maskbits;   /* Prefix length needed to avoid false matches. */
>      union trie_prefix match_plens;  /* Bitmask of prefix lengths with possible
>                                       * matches. */
> @@ -849,7 +850,6 @@ static void
>  trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie)
>  {
>      ctx->trie = trie;
> -    ctx->be32ofs = trie->field->flow_be32ofs;
>      ctx->lookup_done = false;
>  }
>  
> @@ -1531,8 +1531,10 @@ insert_subtable(struct classifier *cls, const struct minimask *mask)
>      *CONST_CAST(uint8_t *, &subtable->n_indices) = index;
>  
>      for (i = 0; i < cls->n_tries; i++) {
> -        subtable->trie_plen[i] = minimask_get_prefix_len(mask,
> -                                                         cls->tries[i].field);
> +        const struct mf_field *field
> +            = ovsrcu_get(struct mf_field *, &cls->tries[i].field);
> +        subtable->trie_plen[i]
> +            = field ? minimask_get_prefix_len(mask, field) : 0;
>      }
>  
>      /* Ports trie. */
> @@ -1577,8 +1579,10 @@ check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
>      for (j = 0; j < n_tries; j++) {
>          /* Is the trie field relevant for this subtable, and
>             is the trie field within the current range of fields? */
> -        if (field_plen[j] &&
> -            flowmap_is_set(&range_map, trie_ctx[j].be32ofs / 2)) {
> +        const struct mf_field *ctx_field
> +            = ovsrcu_get(struct mf_field *, &trie_ctx[j].trie->field);
> +        if (field_plen[j] && ctx_field &&


Looking at this part one more time I see that you didn't actually change
the order of reads (like it was in v1).  trie->filed is still read before
checking the field_plen.

RCU protects from the NULL pointer dereference here, so it should be fine,
but the code remains logically incorrect in terms that writer still thinks
that order of reads is opposite.

Not sure if we need to fix that in this patch, though.  What do you think?


> +            flowmap_is_set(&range_map, ctx_field->flow_be32ofs / 2)) {
>              struct trie_ctx *ctx = &trie_ctx[j];
>  
>              /* On-demand trie lookup. */
> @@ -1601,14 +1605,16 @@ check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
>                   * than this subtable would otherwise. */
>                  if (ctx->maskbits <= field_plen[j]) {
>                      /* Unwildcard the bits and skip the rest. */
> -                    mask_set_prefix_bits(wc, ctx->be32ofs, ctx->maskbits);
> +                    mask_set_prefix_bits(wc, ctx_field->flow_be32ofs,
> +                                         ctx->maskbits);
>                      /* Note: Prerequisite already unwildcarded, as the only
>                       * prerequisite of the supported trie lookup fields is
>                       * the ethertype, which is always unwildcarded. */
>                      return true;
>                  }
>                  /* Can skip if the field is already unwildcarded. */
> -                if (mask_prefix_bits_set(wc, ctx->be32ofs, ctx->maskbits)) {
> +                if (mask_prefix_bits_set(wc, ctx_field->flow_be32ofs,
> +                                         ctx->maskbits)) {
>                      return true;
>                  }
>              }
> @@ -2001,12 +2007,12 @@ static unsigned int
>  trie_lookup(const struct cls_trie *trie, const struct flow *flow,
>              union trie_prefix *plens)
>  {
> -    const struct mf_field *mf = trie->field;
> +    const struct mf_field *mf = ovsrcu_get(struct mf_field *, &trie->field);
>  
>      /* Check that current flow matches the prerequisites for the trie
>       * field.  Some match fields are used for multiple purposes, so we
>       * must check that the trie is relevant for this flow. */
> -    if (mf_are_prereqs_ok(mf, flow, NULL)) {
> +    if (mf && mf_are_prereqs_ok(mf, flow, NULL)) {
>          return trie_lookup_value(&trie->root,
>                                   &((ovs_be32 *)flow)[mf->flow_be32ofs],
>                                   &plens->be32, mf->n_bits);
> @@ -2053,8 +2059,9 @@ minimask_get_prefix_len(const struct minimask *minimask,
>   * happened to be zeros.
>   */
>  static const ovs_be32 *
> -minimatch_get_prefix(const struct minimatch *match, const struct mf_field *mf)
> +minimatch_get_prefix(const struct minimatch *match, rcu_field_ptr *field)
>  {
> +    struct mf_field *mf = ovsrcu_get_protected(struct mf_field *, field);
>      size_t u64_ofs = mf->flow_be32ofs / 2;
>  
>      return (OVS_FORCE const ovs_be32 *)miniflow_get__(match->flow, u64_ofs)
> @@ -2068,7 +2075,7 @@ static void
>  trie_insert(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
>  {
>      trie_insert_prefix(&trie->root,
> -                       minimatch_get_prefix(&rule->match, trie->field), mlen);
> +                       minimatch_get_prefix(&rule->match, &trie->field), mlen);
>  }
>  
>  static void
> @@ -2123,7 +2130,7 @@ static void
>  trie_remove(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
>  {
>      trie_remove_prefix(&trie->root,
> -                       minimatch_get_prefix(&rule->match, trie->field), mlen);
> +                       minimatch_get_prefix(&rule->match, &trie->field), mlen);
>  }
>  
>  /* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
> diff --git a/lib/classifier.h b/lib/classifier.h
> index d1bd4aa..f646a8f 100644
> --- a/lib/classifier.h
> +++ b/lib/classifier.h
> @@ -314,13 +314,15 @@ extern "C" {
>  struct cls_subtable;
>  struct cls_match;
>  
> +struct mf_field;
> +typedef OVSRCU_TYPE(struct mf_field *) rcu_field_ptr;
>  struct trie_node;
>  typedef OVSRCU_TYPE(struct trie_node *) rcu_trie_ptr;
>  
>  /* Prefix trie for a 'field' */
>  struct cls_trie {
> -    const struct mf_field *field; /* Trie field, or NULL. */
> -    rcu_trie_ptr root;            /* NULL if none. */
> +    rcu_field_ptr field;   /* Trie field, or NULL. */
> +    rcu_trie_ptr root;     /* NULL if none. */
>  };
>  
>  enum {
> diff --git a/tests/test-classifier.c b/tests/test-classifier.c
> index 6d53d01..2d98fad 100644
> --- a/tests/test-classifier.c
> +++ b/tests/test-classifier.c
> @@ -512,8 +512,9 @@ verify_tries(struct classifier *cls)
>      int i;
>  
>      for (i = 0; i < cls->n_tries; i++) {
> -        n_rules += trie_verify(&cls->tries[i].root, 0,
> -                               cls->tries[i].field->n_bits);
> +        const struct mf_field * cls_field
> +            = ovsrcu_get(struct mf_field *, &cls->tries[i].field);
> +        n_rules += trie_verify(&cls->tries[i].root, 0, cls_field->n_bits);
>      }
>      assert(n_rules <= cls->n_rules);
>  }
>
Eiichi Tsukata May 27, 2020, 1:37 a.m. UTC | #2
Hello Ilya

Thanks for comments!
Replied inline.

> On May 27, 2020, at 1:28, Ilya Maximets <i.maximets@ovn.org> wrote:
> 
> On 5/18/20 1:06 AM, Eiichi Tsukata wrote:
>> 
>> 
>> diff --git a/lib/classifier.c b/lib/classifier.c
>> index 0fad953..10909a6 100644
>> --- a/lib/classifier.c
>> +++ b/lib/classifier.c
>> @@ -393,7 +393,9 @@ classifier_set_prefix_fields(struct classifier *cls,
>>         bitmap_set1(fields.bm, trie_fields[i]);
>> 
>>         new_fields[n_tries] = NULL;
>> -        if (n_tries >= cls->n_tries || field != cls->tries[n_tries].field) {
>> +        const struct mf_field *cls_field
>> +            = ovsrcu_get(struct mf_field *, &cls->tries[n_tries].field);
>> +        if (n_tries >= cls->n_tries || field != cls_field) {
>>             new_fields[n_tries] = field;
>>             changed = true;
>>         }
>> @@ -454,7 +456,7 @@ trie_init(struct classifier *cls, int trie_idx, const struct mf_field *field)
>>     } else {
>>         ovsrcu_set_hidden(&trie->root, NULL);
>>     }
>> -    trie->field = field;
>> +    ovsrcu_set_hidden(&trie->field, field);
> 
> CONST_CAST required for the 'field' argument.  Otherwise build fails
> due to discarded const qualifier.

Sorry, I’ll fix it in v4. Many thanks.

>> 
>>     /* Ports trie. */
>> @@ -1577,8 +1579,10 @@ check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
>>     for (j = 0; j < n_tries; j++) {
>>         /* Is the trie field relevant for this subtable, and
>>            is the trie field within the current range of fields? */
>> -        if (field_plen[j] &&
>> -            flowmap_is_set(&range_map, trie_ctx[j].be32ofs / 2)) {
>> +        const struct mf_field *ctx_field
>> +            = ovsrcu_get(struct mf_field *, &trie_ctx[j].trie->field);
>> +        if (field_plen[j] && ctx_field &&
> 
> 
> Looking at this part one more time I see that you didn't actually change
> the order of reads (like it was in v1).  trie->filed is still read before
> checking the field_plen.
> 
> RCU protects from the NULL pointer dereference here, so it should be fine,
> but the code remains logically incorrect in terms that writer still thinks
> that order of reads is opposite.
> 
> Not sure if we need to fix that in this patch, though.  What do you think?
> 
> 

I think I should fix it in this patch. The behavior should be fine but it can confuse us.
I’ll change the read order in v4.

Best Regards

Eiichi
diff mbox series

Patch

diff --git a/lib/classifier.c b/lib/classifier.c
index 0fad953..10909a6 100644
--- a/lib/classifier.c
+++ b/lib/classifier.c
@@ -393,7 +393,9 @@  classifier_set_prefix_fields(struct classifier *cls,
         bitmap_set1(fields.bm, trie_fields[i]);
 
         new_fields[n_tries] = NULL;
-        if (n_tries >= cls->n_tries || field != cls->tries[n_tries].field) {
+        const struct mf_field *cls_field
+            = ovsrcu_get(struct mf_field *, &cls->tries[n_tries].field);
+        if (n_tries >= cls->n_tries || field != cls_field) {
             new_fields[n_tries] = field;
             changed = true;
         }
@@ -454,7 +456,7 @@  trie_init(struct classifier *cls, int trie_idx, const struct mf_field *field)
     } else {
         ovsrcu_set_hidden(&trie->root, NULL);
     }
-    trie->field = field;
+    ovsrcu_set_hidden(&trie->field, field);
 
     /* Add existing rules to the new trie. */
     CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {
@@ -839,7 +841,6 @@  classifier_remove_assert(struct classifier *cls,
 struct trie_ctx {
     const struct cls_trie *trie;
     bool lookup_done;        /* Status of the lookup. */
-    uint8_t be32ofs;         /* U32 offset of the field in question. */
     unsigned int maskbits;   /* Prefix length needed to avoid false matches. */
     union trie_prefix match_plens;  /* Bitmask of prefix lengths with possible
                                      * matches. */
@@ -849,7 +850,6 @@  static void
 trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie)
 {
     ctx->trie = trie;
-    ctx->be32ofs = trie->field->flow_be32ofs;
     ctx->lookup_done = false;
 }
 
@@ -1531,8 +1531,10 @@  insert_subtable(struct classifier *cls, const struct minimask *mask)
     *CONST_CAST(uint8_t *, &subtable->n_indices) = index;
 
     for (i = 0; i < cls->n_tries; i++) {
-        subtable->trie_plen[i] = minimask_get_prefix_len(mask,
-                                                         cls->tries[i].field);
+        const struct mf_field *field
+            = ovsrcu_get(struct mf_field *, &cls->tries[i].field);
+        subtable->trie_plen[i]
+            = field ? minimask_get_prefix_len(mask, field) : 0;
     }
 
     /* Ports trie. */
@@ -1577,8 +1579,10 @@  check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
     for (j = 0; j < n_tries; j++) {
         /* Is the trie field relevant for this subtable, and
            is the trie field within the current range of fields? */
-        if (field_plen[j] &&
-            flowmap_is_set(&range_map, trie_ctx[j].be32ofs / 2)) {
+        const struct mf_field *ctx_field
+            = ovsrcu_get(struct mf_field *, &trie_ctx[j].trie->field);
+        if (field_plen[j] && ctx_field &&
+            flowmap_is_set(&range_map, ctx_field->flow_be32ofs / 2)) {
             struct trie_ctx *ctx = &trie_ctx[j];
 
             /* On-demand trie lookup. */
@@ -1601,14 +1605,16 @@  check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
                  * than this subtable would otherwise. */
                 if (ctx->maskbits <= field_plen[j]) {
                     /* Unwildcard the bits and skip the rest. */
-                    mask_set_prefix_bits(wc, ctx->be32ofs, ctx->maskbits);
+                    mask_set_prefix_bits(wc, ctx_field->flow_be32ofs,
+                                         ctx->maskbits);
                     /* Note: Prerequisite already unwildcarded, as the only
                      * prerequisite of the supported trie lookup fields is
                      * the ethertype, which is always unwildcarded. */
                     return true;
                 }
                 /* Can skip if the field is already unwildcarded. */
-                if (mask_prefix_bits_set(wc, ctx->be32ofs, ctx->maskbits)) {
+                if (mask_prefix_bits_set(wc, ctx_field->flow_be32ofs,
+                                         ctx->maskbits)) {
                     return true;
                 }
             }
@@ -2001,12 +2007,12 @@  static unsigned int
 trie_lookup(const struct cls_trie *trie, const struct flow *flow,
             union trie_prefix *plens)
 {
-    const struct mf_field *mf = trie->field;
+    const struct mf_field *mf = ovsrcu_get(struct mf_field *, &trie->field);
 
     /* Check that current flow matches the prerequisites for the trie
      * field.  Some match fields are used for multiple purposes, so we
      * must check that the trie is relevant for this flow. */
-    if (mf_are_prereqs_ok(mf, flow, NULL)) {
+    if (mf && mf_are_prereqs_ok(mf, flow, NULL)) {
         return trie_lookup_value(&trie->root,
                                  &((ovs_be32 *)flow)[mf->flow_be32ofs],
                                  &plens->be32, mf->n_bits);
@@ -2053,8 +2059,9 @@  minimask_get_prefix_len(const struct minimask *minimask,
  * happened to be zeros.
  */
 static const ovs_be32 *
-minimatch_get_prefix(const struct minimatch *match, const struct mf_field *mf)
+minimatch_get_prefix(const struct minimatch *match, rcu_field_ptr *field)
 {
+    struct mf_field *mf = ovsrcu_get_protected(struct mf_field *, field);
     size_t u64_ofs = mf->flow_be32ofs / 2;
 
     return (OVS_FORCE const ovs_be32 *)miniflow_get__(match->flow, u64_ofs)
@@ -2068,7 +2075,7 @@  static void
 trie_insert(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
 {
     trie_insert_prefix(&trie->root,
-                       minimatch_get_prefix(&rule->match, trie->field), mlen);
+                       minimatch_get_prefix(&rule->match, &trie->field), mlen);
 }
 
 static void
@@ -2123,7 +2130,7 @@  static void
 trie_remove(struct cls_trie *trie, const struct cls_rule *rule, int mlen)
 {
     trie_remove_prefix(&trie->root,
-                       minimatch_get_prefix(&rule->match, trie->field), mlen);
+                       minimatch_get_prefix(&rule->match, &trie->field), mlen);
 }
 
 /* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
diff --git a/lib/classifier.h b/lib/classifier.h
index d1bd4aa..f646a8f 100644
--- a/lib/classifier.h
+++ b/lib/classifier.h
@@ -314,13 +314,15 @@  extern "C" {
 struct cls_subtable;
 struct cls_match;
 
+struct mf_field;
+typedef OVSRCU_TYPE(struct mf_field *) rcu_field_ptr;
 struct trie_node;
 typedef OVSRCU_TYPE(struct trie_node *) rcu_trie_ptr;
 
 /* Prefix trie for a 'field' */
 struct cls_trie {
-    const struct mf_field *field; /* Trie field, or NULL. */
-    rcu_trie_ptr root;            /* NULL if none. */
+    rcu_field_ptr field;   /* Trie field, or NULL. */
+    rcu_trie_ptr root;     /* NULL if none. */
 };
 
 enum {
diff --git a/tests/test-classifier.c b/tests/test-classifier.c
index 6d53d01..2d98fad 100644
--- a/tests/test-classifier.c
+++ b/tests/test-classifier.c
@@ -512,8 +512,9 @@  verify_tries(struct classifier *cls)
     int i;
 
     for (i = 0; i < cls->n_tries; i++) {
-        n_rules += trie_verify(&cls->tries[i].root, 0,
-                               cls->tries[i].field->n_bits);
+        const struct mf_field * cls_field
+            = ovsrcu_get(struct mf_field *, &cls->tries[i].field);
+        n_rules += trie_verify(&cls->tries[i].root, 0, cls_field->n_bits);
     }
     assert(n_rules <= cls->n_rules);
 }