[ovs-dev,v1,1/2] conntrack: Fix race for NAT cleanup.
diff mbox series

Message ID 1552464506-35968-1-git-send-email-dlu998@gmail.com
State Superseded
Headers show
Series
  • [ovs-dev,v1,1/2] conntrack: Fix race for NAT cleanup.
Related show

Commit Message

Darrell Ball March 13, 2019, 8:08 a.m. UTC
Reference lists are not fully protected during cleanup of
NAT connections where the bucket lock is transiently not held during
list traversal.  This can lead to referencing freed memory during
cleaning from multiple contexts.  Fix this by protecting with
the existing 'cleanup' mutex in the missed cases where 'conn_clean()'
is called.  'conntrack_flush()' is converted to expiry list traversal
to support the proper bucket level protection with the 'cleanup' mutex.

Fixes: 286de2729955 ("dpdk: Userspace Datapath: Introduce NAT Support.")
Reported-by: solomon <liwei.solomon@gmail.com>
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-March/357056.html
Tested-by: solomon <liwei.solomon@gmail.com>
Signed-off-by: Darrell Ball <dlu998@gmail.com>
---
 lib/conntrack.c | 96 +++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 70 insertions(+), 26 deletions(-)

Comments

Darrell Ball March 13, 2019, 3:51 p.m. UTC | #1
V1 is superseded by the following change to V2.

-    if (OVS_LIKELY(force && ctx->reply && conn)) {
+    if (OVS_UNLIKELY(force && ctx->reply && conn)) {

On Wed, Mar 13, 2019 at 1:08 AM Darrell Ball <dlu998@gmail.com> wrote:

> Reference lists are not fully protected during cleanup of
> NAT connections where the bucket lock is transiently not held during
> list traversal.  This can lead to referencing freed memory during
> cleaning from multiple contexts.  Fix this by protecting with
> the existing 'cleanup' mutex in the missed cases where 'conn_clean()'
> is called.  'conntrack_flush()' is converted to expiry list traversal
> to support the proper bucket level protection with the 'cleanup' mutex.
>
> Fixes: 286de2729955 ("dpdk: Userspace Datapath: Introduce NAT Support.")
> Reported-by: solomon <liwei.solomon@gmail.com>
> Reported-at:
> https://mail.openvswitch.org/pipermail/ovs-dev/2019-March/357056.html
> Tested-by: solomon <liwei.solomon@gmail.com>
> Signed-off-by: Darrell Ball <dlu998@gmail.com>
> ---
>  lib/conntrack.c | 96
> +++++++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 70 insertions(+), 26 deletions(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 691782c..40a6021 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -753,6 +753,24 @@ conn_lookup(struct conntrack *ct, const struct
> conn_key *key, long long now)
>      return ctx.conn;
>  }
>
> +/* This function is called with the bucket lock held. */
> +static struct conn *
> +conn_lookup_any(const struct conn_key *key,
> +                const struct conntrack_bucket *ctb, uint32_t hash)
> +{
> +    struct conn *conn = NULL;
> +
> +    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
> +        if (!conn_key_cmp(&conn->key, key)) {
> +            break;
> +        }
> +        if (!conn_key_cmp(&conn->rev_key, key)) {
> +            break;
> +        }
> +    }
> +    return conn;
> +}
> +
>  static void
>  conn_seq_skew_set(struct conntrack *ct, const struct conn_key *key,
>                    long long now, int seq_skew, bool seq_skew_dir)
> @@ -823,6 +841,20 @@ conn_clean(struct conntrack *ct, struct conn *conn,
>      }
>  }
>
> +static void
> +conn_clean_safe(struct conntrack *ct, struct conn *conn,
> +                struct conntrack_bucket *ctb, uint32_t hash)
> +{
> +    ovs_mutex_lock(&ctb->cleanup_mutex);
> +    ct_lock_lock(&ctb->lock);
> +    conn = conn_lookup_any(&conn->key, ctb, hash);
> +    if (conn) {
> +        conn_clean(ct, conn, ctb);
> +    }
> +    ct_lock_unlock(&ctb->lock);
> +    ovs_mutex_unlock(&ctb->cleanup_mutex);
> +}
> +
>  static bool
>  ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
>  {
> @@ -969,6 +1001,7 @@ conn_update_state(struct conntrack *ct, struct
> dp_packet *pkt,
>      OVS_REQUIRES(ct->buckets[bucket].lock)
>  {
>      bool create_new_conn = false;
> +    struct conn lconn;
>
>      if (ctx->icmp_related) {
>          pkt->md.ct_state |= CS_RELATED;
> @@ -995,7 +1028,10 @@ conn_update_state(struct conntrack *ct, struct
> dp_packet *pkt,
>              pkt->md.ct_state = CS_INVALID;
>              break;
>          case CT_UPDATE_NEW:
> -            conn_clean(ct, *conn, &ct->buckets[bucket]);
> +            memcpy(&lconn, *conn, sizeof lconn);
> +            ct_lock_unlock(&ct->buckets[bucket].lock);
> +            conn_clean_safe(ct, &lconn, &ct->buckets[bucket], ctx->hash);
> +            ct_lock_lock(&ct->buckets[bucket].lock);
>              create_new_conn = true;
>              break;
>          default:
> @@ -1184,8 +1220,12 @@ process_one(struct conntrack *ct, struct dp_packet
> *pkt,
>      conn = ctx->conn;
>
>      /* Delete found entry if in wrong direction. 'force' implies commit.
> */
> -    if (conn && force && ctx->reply) {
> -        conn_clean(ct, conn, &ct->buckets[bucket]);
> +    if (OVS_LIKELY(force && ctx->reply && conn)) {
> +        struct conn lconn;
> +        memcpy(&lconn, conn, sizeof lconn);
> +        ct_lock_unlock(&ct->buckets[bucket].lock);
> +        conn_clean_safe(ct, &lconn, &ct->buckets[bucket], ctx->hash);
> +        ct_lock_lock(&ct->buckets[bucket].lock);
>          conn = NULL;
>      }
>
> @@ -1391,19 +1431,17 @@ sweep_bucket(struct conntrack *ct, struct
> conntrack_bucket *ctb,
>
>      for (unsigned i = 0; i < N_CT_TM; i++) {
>          LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
> -            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> -                if (!conn_expired(conn, now) || count >= limit) {
> -                    min_expiration = MIN(min_expiration,
> conn->expiration);
> -                    if (count >= limit) {
> -                        /* Do not check other lists. */
> -                        COVERAGE_INC(conntrack_long_cleanup);
> -                        return min_expiration;
> -                    }
> -                    break;
> +            if (!conn_expired(conn, now) || count >= limit) {
> +                min_expiration = MIN(min_expiration, conn->expiration);
> +                if (count >= limit) {
> +                    /* Do not check other lists. */
> +                    COVERAGE_INC(conntrack_long_cleanup);
> +                    return min_expiration;
>                  }
> -                conn_clean(ct, conn, ctb);
> -                count++;
> +                break;
>              }
> +            conn_clean(ct, conn, ctb);
> +            count++;
>          }
>      }
>      return min_expiration;
> @@ -2547,16 +2585,19 @@ int
>  conntrack_flush(struct conntrack *ct, const uint16_t *zone)
>  {
>      for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
> -        struct conn *conn, *next;
> -
> -        ct_lock_lock(&ct->buckets[i].lock);
> -        HMAP_FOR_EACH_SAFE (conn, next, node,
> &ct->buckets[i].connections) {
> -            if ((!zone || *zone == conn->key.zone) &&
> -                (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
> -                conn_clean(ct, conn, &ct->buckets[i]);
> +        struct conntrack_bucket *ctb = &ct->buckets[i];
> +        ovs_mutex_lock(&ctb->cleanup_mutex);
> +        ct_lock_lock(&ctb->lock);
> +        for (unsigned j = 0; j < N_CT_TM; j++) {
> +            struct conn *conn, *next;
> +            LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[j])
> {
> +                if (!zone || *zone == conn->key.zone) {
> +                    conn_clean(ct, conn, ctb);
> +                }
>              }
>          }
> -        ct_lock_unlock(&ct->buckets[i].lock);
> +        ct_lock_unlock(&ctb->lock);
> +        ovs_mutex_unlock(&ctb->cleanup_mutex);
>      }
>
>      return 0;
> @@ -2573,16 +2614,19 @@ conntrack_flush_tuple(struct conntrack *ct, const
> struct ct_dpif_tuple *tuple,
>      tuple_to_conn_key(tuple, zone, &ctx.key);
>      ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
>      unsigned bucket = hash_to_bucket(ctx.hash);
> +    struct conntrack_bucket *ctb = &ct->buckets[bucket];
>
> -    ct_lock_lock(&ct->buckets[bucket].lock);
> -    conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
> +    ovs_mutex_lock(&ctb->cleanup_mutex);
> +    ct_lock_lock(&ctb->lock);
> +    conn_key_lookup(ctb, &ctx, time_msec());
>      if (ctx.conn && ctx.conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> -        conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
> +        conn_clean(ct, ctx.conn, ctb);
>      } else {
>          VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
>          error = ENOENT;
>      }
> -    ct_lock_unlock(&ct->buckets[bucket].lock);
> +    ct_lock_unlock(&ctb->lock);
> +    ovs_mutex_unlock(&ctb->cleanup_mutex);
>      return error;
>  }
>
> --
> 1.9.1
>
>

Patch
diff mbox series

diff --git a/lib/conntrack.c b/lib/conntrack.c
index 691782c..40a6021 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -753,6 +753,24 @@  conn_lookup(struct conntrack *ct, const struct conn_key *key, long long now)
     return ctx.conn;
 }
 
+/* This function is called with the bucket lock held. */
+static struct conn *
+conn_lookup_any(const struct conn_key *key,
+                const struct conntrack_bucket *ctb, uint32_t hash)
+{
+    struct conn *conn = NULL;
+
+    HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ctb->connections) {
+        if (!conn_key_cmp(&conn->key, key)) {
+            break;
+        }
+        if (!conn_key_cmp(&conn->rev_key, key)) {
+            break;
+        }
+    }
+    return conn;
+}
+
 static void
 conn_seq_skew_set(struct conntrack *ct, const struct conn_key *key,
                   long long now, int seq_skew, bool seq_skew_dir)
@@ -823,6 +841,20 @@  conn_clean(struct conntrack *ct, struct conn *conn,
     }
 }
 
+static void
+conn_clean_safe(struct conntrack *ct, struct conn *conn,
+                struct conntrack_bucket *ctb, uint32_t hash)
+{
+    ovs_mutex_lock(&ctb->cleanup_mutex);
+    ct_lock_lock(&ctb->lock);
+    conn = conn_lookup_any(&conn->key, ctb, hash);
+    if (conn) {
+        conn_clean(ct, conn, ctb);
+    }
+    ct_lock_unlock(&ctb->lock);
+    ovs_mutex_unlock(&ctb->cleanup_mutex);
+}
+
 static bool
 ct_verify_helper(const char *helper, enum ct_alg_ctl_type ct_alg_ctl)
 {
@@ -969,6 +1001,7 @@  conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
     OVS_REQUIRES(ct->buckets[bucket].lock)
 {
     bool create_new_conn = false;
+    struct conn lconn;
 
     if (ctx->icmp_related) {
         pkt->md.ct_state |= CS_RELATED;
@@ -995,7 +1028,10 @@  conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
             pkt->md.ct_state = CS_INVALID;
             break;
         case CT_UPDATE_NEW:
-            conn_clean(ct, *conn, &ct->buckets[bucket]);
+            memcpy(&lconn, *conn, sizeof lconn);
+            ct_lock_unlock(&ct->buckets[bucket].lock);
+            conn_clean_safe(ct, &lconn, &ct->buckets[bucket], ctx->hash);
+            ct_lock_lock(&ct->buckets[bucket].lock);
             create_new_conn = true;
             break;
         default:
@@ -1184,8 +1220,12 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
     conn = ctx->conn;
 
     /* Delete found entry if in wrong direction. 'force' implies commit. */
-    if (conn && force && ctx->reply) {
-        conn_clean(ct, conn, &ct->buckets[bucket]);
+    if (OVS_LIKELY(force && ctx->reply && conn)) {
+        struct conn lconn;
+        memcpy(&lconn, conn, sizeof lconn);
+        ct_lock_unlock(&ct->buckets[bucket].lock);
+        conn_clean_safe(ct, &lconn, &ct->buckets[bucket], ctx->hash);
+        ct_lock_lock(&ct->buckets[bucket].lock);
         conn = NULL;
     }
 
@@ -1391,19 +1431,17 @@  sweep_bucket(struct conntrack *ct, struct conntrack_bucket *ctb,
 
     for (unsigned i = 0; i < N_CT_TM; i++) {
         LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[i]) {
-            if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
-                if (!conn_expired(conn, now) || count >= limit) {
-                    min_expiration = MIN(min_expiration, conn->expiration);
-                    if (count >= limit) {
-                        /* Do not check other lists. */
-                        COVERAGE_INC(conntrack_long_cleanup);
-                        return min_expiration;
-                    }
-                    break;
+            if (!conn_expired(conn, now) || count >= limit) {
+                min_expiration = MIN(min_expiration, conn->expiration);
+                if (count >= limit) {
+                    /* Do not check other lists. */
+                    COVERAGE_INC(conntrack_long_cleanup);
+                    return min_expiration;
                 }
-                conn_clean(ct, conn, ctb);
-                count++;
+                break;
             }
+            conn_clean(ct, conn, ctb);
+            count++;
         }
     }
     return min_expiration;
@@ -2547,16 +2585,19 @@  int
 conntrack_flush(struct conntrack *ct, const uint16_t *zone)
 {
     for (unsigned i = 0; i < CONNTRACK_BUCKETS; i++) {
-        struct conn *conn, *next;
-
-        ct_lock_lock(&ct->buckets[i].lock);
-        HMAP_FOR_EACH_SAFE (conn, next, node, &ct->buckets[i].connections) {
-            if ((!zone || *zone == conn->key.zone) &&
-                (conn->conn_type == CT_CONN_TYPE_DEFAULT)) {
-                conn_clean(ct, conn, &ct->buckets[i]);
+        struct conntrack_bucket *ctb = &ct->buckets[i];
+        ovs_mutex_lock(&ctb->cleanup_mutex);
+        ct_lock_lock(&ctb->lock);
+        for (unsigned j = 0; j < N_CT_TM; j++) {
+            struct conn *conn, *next;
+            LIST_FOR_EACH_SAFE (conn, next, exp_node, &ctb->exp_lists[j]) {
+                if (!zone || *zone == conn->key.zone) {
+                    conn_clean(ct, conn, ctb);
+                }
             }
         }
-        ct_lock_unlock(&ct->buckets[i].lock);
+        ct_lock_unlock(&ctb->lock);
+        ovs_mutex_unlock(&ctb->cleanup_mutex);
     }
 
     return 0;
@@ -2573,16 +2614,19 @@  conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
     tuple_to_conn_key(tuple, zone, &ctx.key);
     ctx.hash = conn_key_hash(&ctx.key, ct->hash_basis);
     unsigned bucket = hash_to_bucket(ctx.hash);
+    struct conntrack_bucket *ctb = &ct->buckets[bucket];
 
-    ct_lock_lock(&ct->buckets[bucket].lock);
-    conn_key_lookup(&ct->buckets[bucket], &ctx, time_msec());
+    ovs_mutex_lock(&ctb->cleanup_mutex);
+    ct_lock_lock(&ctb->lock);
+    conn_key_lookup(ctb, &ctx, time_msec());
     if (ctx.conn && ctx.conn->conn_type == CT_CONN_TYPE_DEFAULT) {
-        conn_clean(ct, ctx.conn, &ct->buckets[bucket]);
+        conn_clean(ct, ctx.conn, ctb);
     } else {
         VLOG_WARN("Must flush tuple using the original pre-NATed tuple");
         error = ENOENT;
     }
-    ct_lock_unlock(&ct->buckets[bucket].lock);
+    ct_lock_unlock(&ctb->lock);
+    ovs_mutex_unlock(&ctb->cleanup_mutex);
     return error;
 }