diff mbox series

[ovs-dev,v3,7/8] conntrack: Inverse conn and ct lock precedence

Message ID 9ae8ad243da85be4853b90eccc958600dace7726.1623786081.git.grive@u256.net
State Changes Requested
Headers show
Series conntrack: improve multithread scalability | expand

Commit Message

Gaetan Rivet June 15, 2021, 11:22 p.m. UTC
The lock priority order is for the global 'ct_lock' to be taken first
and then 'conn->lock'. This is an issue, as multiple operations on
connections are thus blocked between threads contending on the
global 'ct_lock'.

This was previously necessary due to how the expiration lists, timeout
policies and zone limits were managed. They are now using RCU-friendly
structures that allow concurrent readers. The mutual exclusion now only
needs to happen during writes.

This allows reducing the 'ct_lock' precedence, and to only take it
when writing the relevant structures. This will reduce contention on
'ct_lock', which impairs scalability when the connection tracker is
used by many threads.

Signed-off-by: Gaetan Rivet <grive@u256.net>
Reviewed-by: Eli Britstein <elibr@nvidia.com>
---
 lib/conntrack-private.h |  7 ++++--
 lib/conntrack-tp.c      | 30 +---------------------
 lib/conntrack.c         | 56 +++++++++++++++++++++++++----------------
 3 files changed, 41 insertions(+), 52 deletions(-)

Comments

wenxu July 10, 2021, 2:55 p.m. UTC | #1
Hi Gaetan,


First, Thanks for your patch. This is very useful for us. But maybe 
there are some question need to be checked.


> 
>     ovs_mutex_unlock(&ct->ct_lock);
>@@ -1034,7 +1057,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>                const struct nat_action_info_t *nat_action_info,
>                const char *helper, const struct alg_exp_node *alg_exp,
>                enum ct_alg_ctl_type ct_alg_ctl, uint32_t tp_id)
>-    OVS_REQUIRES(ct->ct_lock)
> {
>     struct conn *nc = NULL;
>     struct conn *nat_conn = NULL;
>@@ -1113,13 +1135,18 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
>             nat_conn->alg = NULL;
>             nat_conn->nat_conn = NULL;
>             uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis);
>+            ovs_mutex_lock(&ct->ct_lock);
>             cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
>+            ovs_mutex_unlock(&ct->ct_lock);
>         }
I think for nat case the port and ip selection function also should contain in the ct_lock
For snat example 10.0.0.2 and 10.0.0.3 both snat to 1.1.1.7, if both the 10.0.0.2 and
10.0.0.3 using the same port to access the same  client. In default both of them will 
select the original port. 
> 

>@@ -1434,12 +1457,10 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
>         }
>         ovs_rwlock_unlock(&ct->resources_lock);
> 
>-        ovs_mutex_lock(&ct->ct_lock);
>         if (!conn_lookup(ct, &ctx->key, now, NULL, NULL)) {
>             conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
>                                   helper, alg_exp, ct_alg_ctl, tp_id);
>         }
I think the conn_lookup should contained in the ct_lock.  If there are two packet create same conntrack
in different pmd,  At the same time both of them find no conntrack through lookup_conn. Then there will
be two conntracks insert to the systems.


>-        ovs_mutex_unlock(&ct->ct_lock);
>     }
> 
>     write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
>@@ -1564,8 +1585,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit)
>     struct mpsc_queue_node *node;
>     size_t count = 0;
> 
>-    ovs_mutex_lock(&ct->ct_lock);
>-
>     for (unsigned i = 0; i < N_CT_TM; i++) {
>         struct conn *end_of_queue = NULL;
> 
>@@ -1630,7 +1649,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit)
> out:
>     VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
>              time_msec() - now);
>-    ovs_mutex_unlock(&ct->ct_lock);
>     return min_expiration;
> }
> 
>@@ -2688,13 +2706,11 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
> {
>     struct conn *conn;
> 
>-    ovs_mutex_lock(&ct->ct_lock);
>     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
>         if (!zone || *zone == conn->key.zone) {
>             conn_clean_one(ct, conn);
>         }
>     }
>-    ovs_mutex_unlock(&ct->ct_lock);
> 
>     return 0;
> }
>@@ -2709,7 +2725,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
> 
>     memset(&key, 0, sizeof(key));
>     tuple_to_conn_key(tuple, zone, &key);
>-    ovs_mutex_lock(&ct->ct_lock);
>     conn_lookup(ct, &key, time_msec(), &conn, NULL);
> 
>     if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
>@@ -2719,7 +2734,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
>         error = ENOENT;
>     }
> 
>-    ovs_mutex_unlock(&ct->ct_lock);
>     return error;
> }
> 
>-- 
>2.31.1
>
>_______________________________________________
>dev mailing list
>dev@openvswitch.org
>https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Gaetan Rivet Aug. 2, 2021, 5:53 p.m. UTC | #2
On Sat, Jul 10, 2021, at 16:55, wenxu wrote:
> Hi Gaetan,
> 
> First, Thanks for your patch. This is very useful for us. But maybe 
> there are some question need to be checked.
> 
> > 
> >     ovs_mutex_unlock(&ct->ct_lock);
> >@@ -1034,7 +1057,6 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
> >                const struct nat_action_info_t *nat_action_info,
> >                const char *helper, const struct alg_exp_node *alg_exp,
> >                enum ct_alg_ctl_type ct_alg_ctl, uint32_t tp_id)
> >-    OVS_REQUIRES(ct->ct_lock)
> > {
> >     struct conn *nc = NULL;
> >     struct conn *nat_conn = NULL;
> >@@ -1113,13 +1135,18 @@ conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
> >             nat_conn->alg = NULL;
> >             nat_conn->nat_conn = NULL;
> >             uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis);
> >+            ovs_mutex_lock(&ct->ct_lock);
> >             cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
> >+            ovs_mutex_unlock(&ct->ct_lock);
> >         }
> I think for nat case the port and ip selection function also should 
> contain in the ct_lock
> For snat example 10.0.0.2 and 10.0.0.3 both snat to 1.1.1.7, if both 
> the 10.0.0.2 and
> 10.0.0.3 using the same port to access the same  client. In default 
> both of them will 
> select the original port. 
> > 
> 
> >@@ -1434,12 +1457,10 @@ process_one(struct conntrack *ct, struct dp_packet *pkt,
> >         }
> >         ovs_rwlock_unlock(&ct->resources_lock);
> > 
> >-        ovs_mutex_lock(&ct->ct_lock);
> >         if (!conn_lookup(ct, &ctx->key, now, NULL, NULL)) {
> >             conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
> >                                   helper, alg_exp, ct_alg_ctl, tp_id);
> >         }
> I think the conn_lookup should contained in the ct_lock.  If there are 
> two packet create same conntrack
> in different pmd,  At the same time both of them find no conntrack 
> through lookup_conn. Then there will
> be two conntracks insert to the systems.
> 

Hi!

I think you are right. I was going too fast removing those locks.
When doing looking before insertion to avoid double-insertion, the
initial lookup must be part of the insertion critical section,
otherwise it will race.

I have fixed it. It has an impact on performance of course, but
there is no way to avoid it. I will send a new version soon.

Thanks for the review!

> >-        ovs_mutex_unlock(&ct->ct_lock);
> >     }
> > 
> >     write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
> >@@ -1564,8 +1585,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit)
> >     struct mpsc_queue_node *node;
> >     size_t count = 0;
> > 
> >-    ovs_mutex_lock(&ct->ct_lock);
> >-
> >     for (unsigned i = 0; i < N_CT_TM; i++) {
> >         struct conn *end_of_queue = NULL;
> > 
> >@@ -1630,7 +1649,6 @@ ct_sweep(struct conntrack *ct, long long now, size_t limit)
> > out:
> >     VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
> >              time_msec() - now);
> >-    ovs_mutex_unlock(&ct->ct_lock);
> >     return min_expiration;
> > }
> > 
> >@@ -2688,13 +2706,11 @@ conntrack_flush(struct conntrack *ct, const uint16_t *zone)
> > {
> >     struct conn *conn;
> > 
> >-    ovs_mutex_lock(&ct->ct_lock);
> >     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
> >         if (!zone || *zone == conn->key.zone) {
> >             conn_clean_one(ct, conn);
> >         }
> >     }
> >-    ovs_mutex_unlock(&ct->ct_lock);
> > 
> >     return 0;
> > }
> >@@ -2709,7 +2725,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
> > 
> >     memset(&key, 0, sizeof(key));
> >     tuple_to_conn_key(tuple, zone, &key);
> >-    ovs_mutex_lock(&ct->ct_lock);
> >     conn_lookup(ct, &key, time_msec(), &conn, NULL);
> > 
> >     if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
> >@@ -2719,7 +2734,6 @@ conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
> >         error = ENOENT;
> >     }
> > 
> >-    ovs_mutex_unlock(&ct->ct_lock);
> >     return error;
> > }
> > 
> >-- 
> >2.31.1
> >
> >_______________________________________________
> >dev mailing list
> >dev@openvswitch.org
> >https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
diff mbox series

Patch

diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h
index ea2e7ed4d..bb82252e8 100644
--- a/lib/conntrack-private.h
+++ b/lib/conntrack-private.h
@@ -134,6 +134,9 @@  struct conn {
     struct nat_action_info_t *nat_info;
     char *alg;
     struct conn *nat_conn; /* The NAT 'conn' context, if there is one. */
+    atomic_flag reclaimed; /* False during the lifetime of the connection,
+                            * True as soon as a thread has started freeing
+                            * its memory. */
 
     /* Inserted once by a PMD, then managed by the 'ct_clean' thread. */
     struct conn_expire exp;
@@ -196,8 +199,8 @@  struct conntrack {
 };
 
 /* Lock acquisition order:
- *    1. 'ct_lock'
- *    2. 'conn->lock'
+ *    1. 'conn->lock'
+ *    2. 'ct_lock'
  *    3. 'resources_lock'
  */
 
diff --git a/lib/conntrack-tp.c b/lib/conntrack-tp.c
index 592e10c6f..22363e7fe 100644
--- a/lib/conntrack-tp.c
+++ b/lib/conntrack-tp.c
@@ -245,58 +245,30 @@  conn_schedule_expiration(struct conn *conn, enum ct_timeout tm, long long now,
     ignore(atomic_flag_test_and_set(&conn->exp.reschedule));
 }
 
-static void
-conn_update_expiration__(struct conntrack *ct, struct conn *conn,
-                         enum ct_timeout tm, long long now,
-                         uint32_t tp_value)
-    OVS_REQUIRES(conn->lock)
-{
-    ovs_mutex_unlock(&conn->lock);
-
-    ovs_mutex_lock(&ct->ct_lock);
-    ovs_mutex_lock(&conn->lock);
-    conn_schedule_expiration(conn, tm, now, tp_value);
-    ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&ct->ct_lock);
-
-    ovs_mutex_lock(&conn->lock);
-}
-
 /* The conn entry lock must be held on entry and exit. */
 void
 conn_update_expiration(struct conntrack *ct, struct conn *conn,
                        enum ct_timeout tm, long long now)
-    OVS_REQUIRES(conn->lock)
 {
     struct timeout_policy *tp;
     uint32_t val;
 
-    ovs_mutex_unlock(&conn->lock);
-
-    ovs_mutex_lock(&ct->ct_lock);
-    ovs_mutex_lock(&conn->lock);
     tp = timeout_policy_lookup(ct, conn->tp_id);
     if (tp) {
         val = tp->policy.attrs[tm_to_ct_dpif_tp(tm)];
     } else {
         val = ct_dpif_netdev_tp_def[tm_to_ct_dpif_tp(tm)];
     }
-    ovs_mutex_unlock(&conn->lock);
-    ovs_mutex_unlock(&ct->ct_lock);
-
-    ovs_mutex_lock(&conn->lock);
     VLOG_DBG_RL(&rl, "Update timeout %s zone=%u with policy id=%d "
                 "val=%u sec.",
                 ct_timeout_str[tm], conn->key.zone, conn->tp_id, val);
 
-    conn_update_expiration__(ct, conn, tm, now, val);
+    conn_schedule_expiration(conn, tm, now, val);
 }
 
-/* ct_lock must be held. */
 void
 conn_init_expiration(struct conntrack *ct, struct conn *conn,
                      enum ct_timeout tm, long long now)
-    OVS_REQUIRES(ct->ct_lock)
 {
     struct timeout_policy *tp;
     uint32_t val;
diff --git a/lib/conntrack.c b/lib/conntrack.c
index 71f51f3d9..045710e8d 100644
--- a/lib/conntrack.c
+++ b/lib/conntrack.c
@@ -465,7 +465,7 @@  zone_limit_delete(struct conntrack *ct, uint16_t zone)
 
 static void
 conn_clean_cmn(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_REQUIRES(conn->lock, ct->ct_lock)
 {
     if (conn->alg) {
         expectation_clean(ct, &conn->key);
@@ -495,18 +495,29 @@  conn_unref(struct conn *conn)
  * removes the associated nat 'conn' from the lookup datastructures. */
 static void
 conn_clean(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_EXCLUDED(conn->lock, ct->ct_lock)
 {
     ovs_assert(conn->conn_type == CT_CONN_TYPE_DEFAULT);
 
+    if (atomic_flag_test_and_set(&conn->reclaimed)) {
+        return;
+    }
+
+    ovs_mutex_lock(&conn->lock);
+
+    ovs_mutex_lock(&ct->ct_lock);
     conn_clean_cmn(ct, conn);
     if (conn->nat_conn) {
         uint32_t hash = conn_key_hash(&conn->nat_conn->key, ct->hash_basis);
         cmap_remove(&ct->conns, &conn->nat_conn->cm_node, hash);
     }
+    ovs_mutex_unlock(&ct->ct_lock);
+
     conn->cleaned = true;
     conn_unref(conn);
     atomic_count_dec(&ct->n_conn);
+
+    ovs_mutex_unlock(&conn->lock);
 }
 
 static inline bool
@@ -521,14 +532,25 @@  conn_unref_one(struct conn *conn)
 
 static void
 conn_clean_one(struct conntrack *ct, struct conn *conn)
-    OVS_REQUIRES(ct->ct_lock)
+    OVS_EXCLUDED(conn->lock, ct->ct_lock)
 {
+    if (atomic_flag_test_and_set(&conn->reclaimed)) {
+        return;
+    }
+
+    ovs_mutex_lock(&conn->lock);
+
+    ovs_mutex_lock(&ct->ct_lock);
     conn_clean_cmn(ct, conn);
+    ovs_mutex_unlock(&ct->ct_lock);
+
     if (conn->conn_type == CT_CONN_TYPE_DEFAULT) {
         conn->cleaned = true;
         atomic_count_dec(&ct->n_conn);
     }
     conn_unref_one(conn);
+
+    ovs_mutex_unlock(&conn->lock);
 }
 
 /* Destroys the connection tracker 'ct' and frees all the allocated memory.
@@ -542,8 +564,6 @@  conntrack_destroy(struct conntrack *ct)
     pthread_join(ct->clean_thread, NULL);
     latch_destroy(&ct->clean_thread_exit);
 
-    ovs_mutex_lock(&ct->ct_lock);
-
     for (unsigned i = 0; i < N_CT_TM; i++) {
         struct mpsc_queue_node *node;
 
@@ -559,7 +579,6 @@  conntrack_destroy(struct conntrack *ct)
     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
         conn_clean_one(ct, conn);
     }
-    cmap_destroy(&ct->conns);
 
     struct zone_limit *zl;
     CMAP_FOR_EACH (zl, node, &ct->zone_limits) {
@@ -568,7 +587,6 @@  conntrack_destroy(struct conntrack *ct)
         cmap_remove(&ct->zone_limits, &zl->node, hash);
         ovsrcu_postpone(free, zl);
     }
-    cmap_destroy(&ct->zone_limits);
 
     struct timeout_policy *tp;
     CMAP_FOR_EACH (tp, node, &ct->timeout_policies) {
@@ -577,6 +595,11 @@  conntrack_destroy(struct conntrack *ct)
         cmap_remove(&ct->timeout_policies, &tp->node, hash);
         ovsrcu_postpone(free, tp);
     }
+
+    ovs_mutex_lock(&ct->ct_lock);
+
+    cmap_destroy(&ct->conns);
+    cmap_destroy(&ct->zone_limits);
     cmap_destroy(&ct->timeout_policies);
 
     ovs_mutex_unlock(&ct->ct_lock);
@@ -1034,7 +1057,6 @@  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
                const struct nat_action_info_t *nat_action_info,
                const char *helper, const struct alg_exp_node *alg_exp,
                enum ct_alg_ctl_type ct_alg_ctl, uint32_t tp_id)
-    OVS_REQUIRES(ct->ct_lock)
 {
     struct conn *nc = NULL;
     struct conn *nat_conn = NULL;
@@ -1113,13 +1135,18 @@  conn_not_found(struct conntrack *ct, struct dp_packet *pkt,
             nat_conn->alg = NULL;
             nat_conn->nat_conn = NULL;
             uint32_t nat_hash = conn_key_hash(&nat_conn->key, ct->hash_basis);
+            ovs_mutex_lock(&ct->ct_lock);
             cmap_insert(&ct->conns, &nat_conn->cm_node, nat_hash);
+            ovs_mutex_unlock(&ct->ct_lock);
         }
 
         nc->nat_conn = nat_conn;
         ovs_mutex_init_adaptive(&nc->lock);
         nc->conn_type = CT_CONN_TYPE_DEFAULT;
+        atomic_flag_clear(&nc->reclaimed);
+        ovs_mutex_lock(&ct->ct_lock);
         cmap_insert(&ct->conns, &nc->cm_node, ctx->hash);
+        ovs_mutex_unlock(&ct->ct_lock);
         conn_expire_push_back(ct, nc);
         atomic_count_inc(&ct->n_conn);
         ctx->conn = nc; /* For completeness. */
@@ -1180,11 +1207,9 @@  conn_update_state(struct conntrack *ct, struct dp_packet *pkt,
             pkt->md.ct_state = CS_INVALID;
             break;
         case CT_UPDATE_NEW:
-            ovs_mutex_lock(&ct->ct_lock);
             if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
                 conn_clean(ct, conn);
             }
-            ovs_mutex_unlock(&ct->ct_lock);
             create_new_conn = true;
             break;
         case CT_UPDATE_VALID_NEW:
@@ -1366,11 +1391,9 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
 
     /* Delete found entry if in wrong direction. 'force' implies commit. */
     if (OVS_UNLIKELY(force && ctx->reply && conn)) {
-        ovs_mutex_lock(&ct->ct_lock);
         if (conn_lookup(ct, &conn->key, now, NULL, NULL)) {
             conn_clean(ct, conn);
         }
-        ovs_mutex_unlock(&ct->ct_lock);
         conn = NULL;
     }
 
@@ -1434,12 +1457,10 @@  process_one(struct conntrack *ct, struct dp_packet *pkt,
         }
         ovs_rwlock_unlock(&ct->resources_lock);
 
-        ovs_mutex_lock(&ct->ct_lock);
         if (!conn_lookup(ct, &ctx->key, now, NULL, NULL)) {
             conn = conn_not_found(ct, pkt, ctx, commit, now, nat_action_info,
                                   helper, alg_exp, ct_alg_ctl, tp_id);
         }
-        ovs_mutex_unlock(&ct->ct_lock);
     }
 
     write_ct_md(pkt, zone, conn, &ctx->key, alg_exp);
@@ -1564,8 +1585,6 @@  ct_sweep(struct conntrack *ct, long long now, size_t limit)
     struct mpsc_queue_node *node;
     size_t count = 0;
 
-    ovs_mutex_lock(&ct->ct_lock);
-
     for (unsigned i = 0; i < N_CT_TM; i++) {
         struct conn *end_of_queue = NULL;
 
@@ -1630,7 +1649,6 @@  ct_sweep(struct conntrack *ct, long long now, size_t limit)
 out:
     VLOG_DBG("conntrack cleanup %"PRIuSIZE" entries in %lld msec", count,
              time_msec() - now);
-    ovs_mutex_unlock(&ct->ct_lock);
     return min_expiration;
 }
 
@@ -2688,13 +2706,11 @@  conntrack_flush(struct conntrack *ct, const uint16_t *zone)
 {
     struct conn *conn;
 
-    ovs_mutex_lock(&ct->ct_lock);
     CMAP_FOR_EACH (conn, cm_node, &ct->conns) {
         if (!zone || *zone == conn->key.zone) {
             conn_clean_one(ct, conn);
         }
     }
-    ovs_mutex_unlock(&ct->ct_lock);
 
     return 0;
 }
@@ -2709,7 +2725,6 @@  conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
 
     memset(&key, 0, sizeof(key));
     tuple_to_conn_key(tuple, zone, &key);
-    ovs_mutex_lock(&ct->ct_lock);
     conn_lookup(ct, &key, time_msec(), &conn, NULL);
 
     if (conn && conn->conn_type == CT_CONN_TYPE_DEFAULT) {
@@ -2719,7 +2734,6 @@  conntrack_flush_tuple(struct conntrack *ct, const struct ct_dpif_tuple *tuple,
         error = ENOENT;
     }
 
-    ovs_mutex_unlock(&ct->ct_lock);
     return error;
 }