diff mbox series

[ovs-dev,v1,5/9] conntrack: Inverse conn and ct lock precedence

Message ID 8da1b23f2a7bb2b8e216b845ecb022795f9ac7c4.1613557616.git.grive@u256.net
State Changes Requested
Headers show
Series conntrack: improve multithread scalability | expand

Commit Message

Gaetan Rivet Feb. 17, 2021, 4:34 p.m. UTC
The lock priority order is for the global 'ct_lock' to be taken first
and then 'conn->lock'. This is an issue, as multiple operations on
connections are thus blocked between threads contending on the
global 'ct_lock'.

This was previously necessary due to how the expiration lists, timeout
policies and zone limits were managed. They are now using RCU-friendly
structures that allow concurrent readers. The mutual exclusion now only
needs to happen during writes.

This allows reducing the 'ct_lock' precedence, and to only take it
when writing the relevant structures. This will reduce contention on
'ct_lock', which impairs scalability when the connection tracker is
used by many threads.

Signed-off-by: Gaetan Rivet <grive@u256.net>
Reviewed-by: Eli Britstein <elibr@nvidia.com>
---
 lib/conntrack-private.h | 16 +++++++++---
 lib/conntrack-tp.c      | 29 ++++++---------------
 lib/conntrack.c         | 58 +++++++++++++++++++++++++----------------
 3 files changed, 57 insertions(+), 46 deletions(-)

Comments

William Tu Feb. 23, 2021, 10:55 p.m. UTC | #1
On Wed, Feb 17, 2021 at 8:34 AM Gaetan Rivet <grive@u256.net> wrote:
>
> The lock priority order is for the global 'ct_lock' to be taken first
> and then 'conn->lock'. This is an issue, as multiple operations on
> connections are thus blocked between threads contending on the
> global 'ct_lock'.
>
> This was previously necessary due to how the expiration lists, timeout
> policies and zone limits were managed. They are now using RCU-friendly
> structures that allow concurrent readers. The mutual exclusion now only
> needs to happen during writes.
>
> This allows reducing the 'ct_lock' precedence, and to only take it
> when writing the relevant structures. This will reduce contention on
> 'ct_lock', which impairs scalability when the connection tracker is
> used by many threads.
>
> Signed-off-by: Gaetan Rivet <grive@u256.net>
> Reviewed-by: Eli Britstein <elibr@nvidia.com>
> ---

Thanks! it's pretty cool to see locks being removed or optimized.
The changes make sense to me. But maybe others should also
take a second look.


William
diff mbox series

Patch

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index 7eb555092..e8ffb5bf9 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -140,6 +140,9 @@  struct conn {
     struct nat_action_info_t *nat_info;
     char *alg;
     struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
+    atomic_flag reclaimed; /* False during the lifetime of the connection,
+                            * True as soon as a thread has started freeing
+                            * its memory. */
 
     /* Mutable data. */
     struct ovs_mutex lock; /* Guards all mutable fields. */
@@ -200,8 +203,8 @@  struct conntrack {
 };
 
 /* Lock acquisition order:
- *    1. 'ct_lock'
- *    2. 'conn->lock'
+ *    1. 'conn->lock'
+ *    2. 'ct_lock'
  *    3. 'resources_lock'
  */
 
@@ -222,11 +225,18 @@  struct ct_l4_proto {
 };
 
 static inline void
-conn_expire_remove(struct conn_expire *exp)
+conn_expire_remove(struct conn_expire *exp, bool need_ct_lock)
+    OVS_NO_THREAD_SAFETY_ANALYSIS
 {
     if (!atomic_flag_test_and_set(&exp->remove_once)
         && rculist_next(&exp->node)) {
+        if (need_ct_lock) {
+            ovs_mutex_lock(&exp->ct->ct_lock);
+        }
         rculist_remove(&exp->node);
+        if (need_ct_lock) {
+            ovs_mutex_unlock(&exp->ct->ct_lock);
+        }
     }
 }
 
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index a376857ec..01ffa30df 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -238,6 +238,7 @@  tm_to_ct_dpif_tp(enum ct_timeout tm)
 
 static void
 conn_expire_init(struct conn *conn, struct conntrack *ct)
+    OVS_REQUIRES(conn->lock)
 {
     struct conn_expire *exp = &conn->exp;
 
@@ -257,20 +258,22 @@  conn_expire_insert(struct conn *conn)
 {
     struct conn_expire *exp = &conn->exp;
 
-    ovs_mutex_lock(&exp->ct->ct_lock);
     ovs_mutex_lock(&conn->lock);
 
+    ovs_mutex_lock(&exp->ct->ct_lock);
     rculist_push_back(&exp->ct->exp_lists[exp->tm], &exp->node);
+    ovs_mutex_unlock(&exp->ct->ct_lock);
+
     atomic_flag_clear(&exp->insert_once);
     atomic_flag_clear(&exp->remove_once);
 
     ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&exp->ct->ct_lock);
 }
 
 static void
 conn_schedule_expiration(struct conntrack *ct, struct conn *conn,
                          enum ct_timeout tm, long long now, uint32_t tp_value)
+    OVS_REQUIRES(conn->lock)
 {
     conn_expire_init(conn, ct);
     conn->expiration = now + tp_value * 1000;
@@ -285,19 +288,12 @@  conn_update_expiration__(struct conntrack *ct, struct conn *conn,
                          enum ct_timeout tm, long long now,
                          uint32_t tp_value)
     OVS_REQUIRES(conn->lock)
+    OVS_EXCLUDED(ct->ct_lock)
 {
-    ovs_mutex_unlock(&conn->lock);
-
-    ovs_mutex_lock(&ct->ct_lock);
-    ovs_mutex_lock(&conn->lock);
     if (!conn->cleaned) {
-        conn_expire_remove(&conn->exp);
+        conn_expire_remove(&conn->exp, true);
         conn_schedule_expiration(ct, conn, tm, now, tp_value);
     }
-    ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&ct->ct_lock);
-
-    ovs_mutex_lock(&conn->lock);
 }
 
 /* The conn entry lock must be held on entry and exit. */
@@ -309,20 +305,12 @@  conn_update_expiration(struct conntrack *ct, struct conn *conn,
     struct timeout_policy *tp;
     uint32_t val;
 
-    ovs_mutex_unlock(&conn->lock);
-
-    ovs_mutex_lock(&ct->ct_lock);
-    ovs_mutex_lock(&conn->lock);
     tp = timeout_policy_lookup(ct, conn->tp_id);
     if (tp) {
         val = tp->policy.attrs[tm_to_ct_dpif_tp(tm)];
     } else {
         val = ct_dpif_netdev_tp_def[tm_to_ct_dpif_tp(tm)];
     }
-    ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&ct->ct_lock);
-
-    ovs_mutex_lock(&conn->lock);
     VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d "
                 "val=%u sec.",
                 ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
@@ -330,11 +318,10 @@  conn_update_expiration(struct conntrack *ct, struct conn *conn,
     conn_update_expiration__(ct, conn, tm, now, val);
 }
 
-/* ct_lock must be held. */
 void
 conn_init_expiration(struct conntrack *ct, struct conn *conn,
                      enum ct_timeout tm, long long now)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_REQUIRES(conn->lock)
 {
     struct timeout_policy *tp;
     uint32_t val;
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 06b18f9fa..5aad64994 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -465,7 +465,7 @@  zone_limit_delete(struct conntrack *ct, uint16_t zone)
 
 static void
 conn_clean_cmn(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_REQUIRES(conn->lock, ct->ct_lock)
 {
     if (conn->alg) {
         expectation_clean(ct, &conn->key);
@@ -484,32 +484,52 @@  conn_clean_cmn(struct conntrack *ct, struct conn *conn)
  * removes the associated nat 'conn' from the lookup datastructures. */
 static void
 conn_clean(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_EXCLUDED(conn->lock, ct->ct_lock)
 {
     ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
 
+    if (atomic_flag_test_and_set(&conn->reclaimed)) {
+        return;
+    }
+
+    ovs_mutex_lock(&conn->lock);
+    ovs_mutex_lock(&ct->ct_lock);
+
     conn_clean_cmn(ct, conn);
     if (conn->nat_conn) {
         uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
         cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
     }
-    conn_expire_remove(&conn->exp);
+    conn_expire_remove(&conn->exp, false);
     conn->cleaned = true;
     ovsrcu_postpone(delete_conn, conn);
     atomic_count_dec(&ct->n_conn);
+
+    ovs_mutex_unlock(&ct->ct_lock);
+    ovs_mutex_unlock(&conn->lock);
 }
 
 static void
 conn_clean_one(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_EXCLUDED(conn->lock, ct->ct_lock)
 {
+    if (atomic_flag_test_and_set(&conn->reclaimed)) {
+        return;
+    }
+
+    ovs_mutex_lock(&conn->lock);
+    ovs_mutex_lock(&ct->ct_lock);
+
     conn_clean_cmn(ct, conn);
     if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
-        conn_expire_remove(&conn->exp);
+        conn_expire_remove(&conn->exp, false);
         conn->cleaned = true;
         atomic_count_dec(&ct->n_conn);
     }
     ovsrcu_postpone(delete_conn_one, conn);
+
+    ovs_mutex_unlock(&ct->ct_lock);
+    ovs_mutex_unlock(&conn->lock);
 }
 
 /* Destroys the connection tracker 'ct' and frees all the allocated memory.
@@ -523,10 +543,12 @@  conntrack_destroy(struct conntrack *ct)
     pthread_join(ct->clean_thread, NULL);
     latch_destroy(&ct->clean_thread_exit);
 
-    ovs_mutex_lock(&ct->ct_lock);
     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
         conn_clean_one(ct, conn);
     }
+
+    ovs_mutex_lock(&ct->ct_lock);
+
     cmap_destroy(&ct->conns);
 
     struct zone_limit *zl;
@@ -1002,7 +1024,6 @@  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
                const struct nat_action_info_t *nat_action_info,
                const char *helper, const struct alg_exp_node *alg_exp,
                enum ct_alg_ctl_type ct_alg_ctl, uint32_t tp_id)
-    OVS_REQUIRES(ct->ct_lock)
 {
     struct conn *nc = NULL;
     struct conn *nat_conn = NULL;
@@ -1076,18 +1097,24 @@  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
             nat_packet(pkt, nc, ctx->icmp_related);
             memcpy(&nat_conn->key, &nc->rev_key, sizeof nat_conn->key);
             memcpy(&nat_conn->rev_key, &nc->key, sizeof nat_conn->rev_key);
+            ovs_mutex_init_adaptive(&nat_conn->lock);
             nat_conn->conn_type = CT_CONN_TYPE_UN_NAT;
             nat_conn->nat_info = NULL;
             nat_conn->alg = NULL;
             nat_conn->nat_conn = NULL;
             uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis);
+            ovs_mutex_lock(&ct->ct_lock);
             cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
+            ovs_mutex_unlock(&ct->ct_lock);
         }
 
         nc->nat_conn = nat_conn;
         ovs_mutex_init_adaptive(&nc->lock);
         nc->conn_type = CT_CONN_TYPE_DEFAULT;
+        atomic_flag_clear(&nc->reclaimed);
+        ovs_mutex_lock(&ct->ct_lock);
         cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
+        ovs_mutex_unlock(&ct->ct_lock);
         atomic_count_inc(&ct->n_conn);
         ctx->conn = nc; /* For completeness. */
 
@@ -1110,7 +1137,7 @@  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
      * can limit DoS impact. */
 nat_res_exhaustion:
     free(nat_conn);
-    conn_expire_remove(&nc->exp);
+    conn_expire_remove(&nc->exp, false);
     ovsrcu_postpone(delete_conn_cmn, nc);
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
     VLOG_WARN_RL(&rl, "Unable to NAT due to tuple space exhaustion - "
@@ -1150,11 +1177,9 @@  conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
             pkt->md.ct_state = CS_INVALID;
             break;
         case CT_UPDATE_NEW:
-            ovs_mutex_lock(&ct->ct_lock);
             if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
                 conn_clean(ct, conn);
             }
-            ovs_mutex_unlock(&ct->ct_lock);
             create_new_conn = true;
             break;
         case CT_UPDATE_VALID_NEW:
@@ -1336,11 +1361,9 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
 
     /* Delete found entry if in wrong direction. 'force' implies commit. */
     if (OVS_UNLIKELY(force && ctx->reply && conn)) {
-        ovs_mutex_lock(&ct->ct_lock);
         if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
             conn_clean(ct, conn);
         }
-        ovs_mutex_unlock(&ct->ct_lock);
         conn = NULL;
     }
 
@@ -1404,12 +1427,10 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
         }
         ovs_rwlock_unlock(&ct->resources_lock);
 
-        ovs_mutex_lock(&ct->ct_lock);
         if (!conn_lookup(ct, &ctx->key, now, NULL, NULL)) {
             conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
                                   helper, alg_exp, ct_alg_ctl, tp_id);
         }
-        ovs_mutex_unlock(&ct->ct_lock);
     }
 
     write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
@@ -1532,8 +1553,6 @@  ct_sweep(struct conntrack *ct, long long now, size_t limit)
     long long min_expiration = LLONG_MAX;
     size_t count = 0;
 
-    ovs_mutex_lock(&ct->ct_lock);
-
     for (unsigned i = 0; i < N_CT_TM; i++) {
         RCULIST_FOR_EACH (conn, exp.node, &ct->exp_lists[i]) {
             ovs_mutex_lock(&conn->lock);
@@ -1557,7 +1576,6 @@  ct_sweep(struct conntrack *ct, long long now, size_t limit)
 out:
     VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
              time_msec() - now);
-    ovs_mutex_unlock(&ct->ct_lock);
     return min_expiration;
 }
 
@@ -2606,13 +2624,11 @@  conntrack_flush(struct conntrack *ct, const uint16_t *zone)
 {
     struct conn *conn;
 
-    ovs_mutex_lock(&ct->ct_lock);
     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
         if (!zone || *zone == conn->key.zone) {
             conn_clean_one(ct, conn);
         }
     }
-    ovs_mutex_unlock(&ct->ct_lock);
 
     return 0;
 }
@@ -2627,9 +2643,8 @@  conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
 
     memset(&key, 0, sizeof(key));
     tuple_to_conn_key(tuple, zone, &key);
-    ovs_mutex_lock(&ct->ct_lock);
-    conn_lookup(ct, &key, time_msec(), &conn, NULL);
 
+    conn_lookup(ct, &key, time_msec(), &conn, NULL);
     if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
         conn_clean(ct, conn);
     } else {
@@ -2637,7 +2652,6 @@  conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
         error = ENOENT;
     }
 
-    ovs_mutex_unlock(&ct->ct_lock);
     return error;
 }